arm_compute v19.02
Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 0947d58..18ef185 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -148,7 +148,7 @@
const GPUTarget gpu_target = get_target_from_name(device_name);
// SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
- std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+ std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
}
@@ -230,4 +230,29 @@
return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
}
}
+
+size_t preferred_vector_width(const cl::Device &device, const DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QASYMM8:
+ return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>();
+ case DataType::U16:
+ case DataType::S16:
+ return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
+ case DataType::U32:
+ case DataType::S32:
+ return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT>();
+ case DataType::F16:
+ case DataType::F32:
+ return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>();
+ case DataType::U64:
+ case DataType::S64:
+ return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG>();
+ default:
+ return 1;
+ }
+}
} // namespace arm_compute
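
Note: the preferred_vector_width() helper added above simply forwards the device's CL_DEVICE_PREFERRED_VECTOR_WIDTH_* query for the given DataType (returning 1 for types without a matching query). A minimal, illustrative sketch of how a caller could turn it into a -DVEC_SIZE build option; the include paths and the assumption that the declaration lives in CLHelpers.h are mine, not part of this patch:

    #include "arm_compute/core/CL/CLHelpers.h" // assumed location of the declaration
    #include "arm_compute/core/CL/OpenCL.h"    // cl::Device wrapper
    #include "arm_compute/core/Types.h"        // arm_compute::DataType
    #include <string>

    // Build a -DVEC_SIZE option from the device's preferred SIMD width for F32.
    std::string vec_size_option(const cl::Device &device)
    {
        const size_t vw = arm_compute::preferred_vector_width(device, arm_compute::DataType::F32);
        return "-DVEC_SIZE=" + std::to_string(vw);
    }
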
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ff4803e..4ecb885 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -149,11 +149,7 @@
{ "accumulate_weighted", "accumulate.cl" },
{ "activation_layer", "activation_layer.cl" },
{ "activation_layer_qa8", "activation_layer_qa8.cl" },
- { "arithmetic_add_quantized", "arithmetic_op_quantized.cl" },
- { "arithmetic_add", "arithmetic_op.cl" },
- { "arithmetic_sub", "arithmetic_op.cl" },
- { "arithmetic_sub_quantized", "arithmetic_op_quantized.cl" },
- { "arithmetic_div", "arithmetic_op.cl" },
+ { "activation_layer_logistic_qa8", "activation_layer_qa8.cl" },
{ "batch_to_space_nchw", "batch_to_space.cl" },
{ "batch_to_space_static_nchw", "batch_to_space.cl" },
{ "batch_to_space_nhwc", "batch_to_space.cl" },
@@ -180,6 +176,18 @@
{ "channel_extract_YUYV422", "channel_extract.cl" },
{ "combine_gradients_L1", "canny.cl" },
{ "combine_gradients_L2", "canny.cl" },
+ { "compare_equal", "comparisons.cl" },
+ { "compare_equal_quantized", "comparisons.cl" },
+ { "compare_notequal", "comparisons.cl" },
+ { "compare_notequal_quantized", "comparisons.cl" },
+ { "compare_greater", "comparisons.cl" },
+ { "compare_greater_quantized", "comparisons.cl" },
+ { "compare_greaterequal", "comparisons.cl" },
+ { "compare_greaterequal_quantized", "comparisons.cl" },
+ { "compare_less", "comparisons.cl" },
+ { "compare_less_quantized", "comparisons.cl" },
+ { "compare_lessequal", "comparisons.cl" },
+ { "compare_lessequal_quantized", "comparisons.cl" },
{ "concatenate_depth", "concatenate.cl" },
{ "concatenate_width", "concatenate.cl" },
{ "concatenate_width_x2", "concatenate.cl" },
@@ -218,9 +226,10 @@
{ "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl" },
+ { "depthwise_convolution_reshape_weights", "depthwise_convolution.cl" },
+ { "depthwise_convolution_reshape_weights_generic", "depthwise_convolution.cl" },
{ "depthwise_im2col", "depthwise_convolution.cl" },
{ "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
- { "depthwise_weights_reshape", "depthwise_convolution.cl" },
{ "dequantization_layer", "dequantization_layer.cl" },
{ "derivative", "derivative.cl" },
{ "dilate", "dilate.cl" },
@@ -234,6 +243,19 @@
{ "direct_convolution5x5_nhwc", "direct_convolution5x5.cl" },
{ "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
{ "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
+ { "elementwise_operation_ADD", "elementwise_operation.cl" },
+ { "elementwise_operation_SUB", "elementwise_operation.cl" },
+ { "elementwise_operation_MAX", "elementwise_operation.cl" },
+ { "elementwise_operation_MIN", "elementwise_operation.cl" },
+ { "elementwise_operation_DIV", "elementwise_operation.cl" },
+ { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" },
+ { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_operation_MIN_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
+ { "elementwise_unary", "elementwise_unary.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
{ "flatten", "flatten.cl" },
@@ -242,6 +264,7 @@
{ "finalize", "optical_flow_pyramid_lk.cl" },
{ "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
{ "floor_layer", "floor.cl" },
+ { "gather", "gather.cl" },
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
{ "gemm_accumulate_biases", "gemm.cl" },
@@ -260,8 +283,13 @@
{ "gemm_mm_floating_point_f16_bifrost_acc32", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
+ { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
{ "gemm_transpose1xW", "gemm.cl" },
+ { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
+ { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
+ { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
+ { "gemm_reshape_rhs_matrix_t", "gemm.cl" },
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
{ "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
@@ -271,6 +299,8 @@
{ "gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl" },
{ "gemmlowp_mm_interleaved_transposed_bifrost_dot8", "gemmlowp.cl" },
{ "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
@@ -296,6 +326,7 @@
{ "im2col_generic_nchw", "im2col.cl" },
{ "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
{ "im2col3x3_nhwc", "im2col.cl" },
+ { "im2col9x9_nhwc", "im2col.cl" },
{ "im2col_generic_nhwc", "im2col.cl" },
{ "init_level", "optical_flow_pyramid_lk.cl" },
{ "init_level_max", "optical_flow_pyramid_lk.cl" },
@@ -326,7 +357,8 @@
{ "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
{ "non_max_suppression", "nonmax.cl" },
{ "normalization_layer_cross_map", "normalization_layer.cl" },
- { "normalization_layer_in_map", "normalization_layer.cl" },
+ { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
+ { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
{ "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
{ "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
{ "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
@@ -340,9 +372,7 @@
{ "NV21_to_RGBA8888_bt709", "color_convert.cl" },
{ "NV21_to_YUV444_bt709", "color_convert.cl" },
{ "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
- { "permute_201", "permute.cl" },
- { "permute_120", "permute.cl" },
- { "permute_3201", "permute.cl" },
+ { "permute", "permute.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
@@ -355,10 +385,11 @@
{ "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
{ "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
{ "prior_box_layer_nchw", "prior_box_layer.cl" },
- { "prior_box_layer_nhwc", "prior_box_layer.cl" },
{ "quantization_layer", "quantization_layer.cl" },
+ { "range", "range.cl" },
+ { "range_quantized", "range.cl" },
{ "reduction_operation_x", "reduction_operation.cl" },
- { "reduction_operation_quantized_x", "reduction_operation.cl" },
+ { "reduction_operation_non_parallel_x", "reduction_operation.cl" },
{ "reduction_operation_y", "reduction_operation.cl" },
{ "reduction_operation_z", "reduction_operation.cl" },
{ "reduction_operation_w", "reduction_operation.cl" },
@@ -368,6 +399,7 @@
{ "reorg_layer_nhwc", "reorg_layer.cl" },
{ "reshape_layer", "reshape_layer.cl" },
{ "reshape_to_columns", "convolution_layer.cl" },
+ { "reverse", "reverse.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
{ "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
@@ -386,6 +418,9 @@
{ "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
{ "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
{ "scharr3x3", "scharr_filter.cl" },
+ { "select_same_rank", "select.cl" },
+ { "select_different_rank_2", "select.cl" },
+ { "select_different_rank_n", "select.cl" },
{ "sobel3x3", "sobel_filter.cl" },
{ "sobel_separable5x1", "sobel_filter.cl" },
{ "sobel_separable1x5", "sobel_filter.cl" },
@@ -401,12 +436,14 @@
{ "space_to_batch_nhwc", "space_to_batch.cl" },
{ "space_to_batch_static_nhwc", "space_to_batch.cl" },
{ "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
+ { "stack_layer", "stack_layer.cl" },
{ "strided_slice", "slice_ops.cl" },
{ "suppress_non_maximum", "canny.cl" },
{ "tablelookup_U8", "tablelookup.cl" },
{ "tablelookup_S16", "tablelookup.cl" },
{ "threshold_binary", "threshold.cl" },
{ "threshold_range", "threshold.cl" },
+ { "tile", "tile.cl" },
{ "transpose", "transpose.cl" },
{ "UYVY422_to_IYUV_bt709", "color_convert.cl" },
{ "UYVY422_to_NV12_bt709", "color_convert.cl" },
@@ -494,14 +531,6 @@
#include "./cl_kernels/activation_layer_qa8.clembed"
},
{
- "arithmetic_op.cl",
-#include "./cl_kernels/arithmetic_op.clembed"
- },
- {
- "arithmetic_op_quantized.cl",
-#include "./cl_kernels/arithmetic_op_quantized.clembed"
- },
- {
"batch_to_space.cl",
#include "./cl_kernels/batch_to_space.clembed"
},
@@ -534,6 +563,10 @@
#include "./cl_kernels/col2im.clembed"
},
{
+ "comparisons.cl",
+#include "./cl_kernels/comparisons.clembed"
+ },
+ {
"concatenate.cl",
#include "./cl_kernels/concatenate.clembed"
},
@@ -622,6 +655,18 @@
#include "./cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.clembed"
},
{
+ "elementwise_operation.cl",
+#include "./cl_kernels/elementwise_operation.clembed"
+ },
+ {
+ "elementwise_operation_quantized.cl",
+#include "./cl_kernels/elementwise_operation_quantized.clembed"
+ },
+ {
+ "elementwise_unary.cl",
+#include "./cl_kernels/elementwise_unary.clembed"
+ },
+ {
"erode.cl",
#include "./cl_kernels/erode.clembed"
},
@@ -642,6 +687,10 @@
#include "./cl_kernels/floor.clembed"
},
{
+ "gather.cl",
+#include "./cl_kernels/gather.clembed"
+ },
+ {
"gaussian_pyramid.cl",
#include "./cl_kernels/gaussian_pyramid.clembed"
},
@@ -778,6 +827,10 @@
#include "./cl_kernels/quantization_layer.clembed"
},
{
+ "range.cl",
+#include "./cl_kernels/range.clembed"
+ },
+ {
"reduction_operation.cl",
#include "./cl_kernels/reduction_operation.clembed"
},
@@ -794,6 +847,10 @@
#include "./cl_kernels/reshape_layer.clembed"
},
{
+ "reverse.cl",
+#include "./cl_kernels/reverse.clembed"
+ },
+ {
"roi_align_layer.cl",
#include "./cl_kernels/roi_align_layer.clembed"
},
@@ -814,6 +871,10 @@
#include "./cl_kernels/scharr_filter.clembed"
},
{
+ "select.cl",
+#include "./cl_kernels/select.clembed"
+ },
+ {
"sobel_filter.cl",
#include "./cl_kernels/sobel_filter.clembed"
},
@@ -834,6 +895,10 @@
#include "./cl_kernels/space_to_batch.clembed"
},
{
+ "stack_layer.cl",
+#include "./cl_kernels/stack_layer.clembed"
+ },
+ {
"tablelookup.cl",
#include "./cl_kernels/tablelookup.clembed"
},
@@ -842,6 +907,10 @@
#include "./cl_kernels/threshold.clembed"
},
{
+ "tile.cl",
+#include "./cl_kernels/tile.clembed"
+ },
+ {
"transpose.cl",
#include "./cl_kernels/transpose.clembed"
},
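
Note: the two tables modified above map each kernel name to the .cl file that defines it, and each .cl file to its embedded source (the .clembed includes). A minimal sketch of that two-step lookup, with hypothetical parameter names (the real CLKernelLibrary members and accessors may differ):

    #include <map>
    #include <stdexcept>
    #include <string>

    // Resolve the embedded OpenCL source for a kernel name from two tables of
    // the same shape as those edited above.
    std::string source_for_kernel(const std::string &kernel_name,
                                  const std::map<std::string, std::string> &kernel_to_program, // kernel name -> .cl file
                                  const std::map<std::string, std::string> &program_to_source) // .cl file -> embedded source
    {
        const auto program_it = kernel_to_program.find(kernel_name);
        if(program_it == kernel_to_program.end())
        {
            throw std::runtime_error("Unknown kernel: " + kernel_name);
        }
        const auto source_it = program_to_source.find(program_it->second);
        if(source_it == program_to_source.end())
        {
            throw std::runtime_error("No embedded source for program: " + program_it->second);
        }
        return source_it->second;
    }
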
diff --git a/src/core/CL/cl_kernels/activation_helpers.h b/src/core/CL/cl_kernels/activation_helpers.h
index dfab082..9d4af84 100644
--- a/src/core/CL/cl_kernels/activation_helpers.h
+++ b/src/core/CL/cl_kernels/activation_helpers.h
@@ -70,7 +70,7 @@
// Soft RELU Activation
inline TYPE srelu_op(TYPE x)
{
- return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+ return CONVERT(LOG_OP(ADD_OP((VEC_DATA_TYPE(float, VEC_SIZE))CONST_ONE, EXP_OP(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE))))), TYPE);
}
// Absolute Activation
inline TYPE abs_op(TYPE x)
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index 8f6a807..cfb6137 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,16 +26,6 @@
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-// Logistic Activation
-inline TYPE logistic_op(TYPE x)
-{
- VEC_FLOAT x_flt = CONVERT(x, VEC_FLOAT);
- x_flt = round(x_flt - (float)O1_VAL) * ((float)S1_VAL);
- x_flt = 1.f / (1.f + exp(-x_flt));
-
- const TYPE x_u8 = CONVERT_SAT(round(x_flt / ((float)S1_VAL)) + (float)O1_VAL, TYPE);
- return x_u8;
-}
// RELU Activation
inline TYPE relu_op(TYPE x)
{
@@ -95,14 +85,14 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
*/
__kernel void activation_layer_qa8(
TENSOR3D_DECLARATION(input)
@@ -131,3 +121,69 @@
}
#endif /* defined(ACT) */
+
+#if defined(O2_VAL) && defined(S2_VAL)
+#define OFFSET_OUT O2_VAL
+#define SCALE_OUT S2_VAL
+#else // defined(O2_VAL) && defined(S2_VAL)
+#define OFFSET_OUT O1_VAL
+#define SCALE_OUT S1_VAL
+#endif // defined(O2_VAL) && defined(S2_VAL)
+
+/** This performs a Logistic activation function on QASYMM8 inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_logistic_qa8(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT);
+ data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+ data_flt = 1.f / (1.f + exp(-data_flt));
+
+ data = CONVERT_SAT(round(data_flt / ((float)SCALE_OUT)) + (float)OFFSET_OUT, TYPE);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+}
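
Note: the per-element math of the new activation_layer_logistic_qa8 kernel, written out as a scalar C++ sketch (illustrative only, not library code): dequantize with the input quantization (O1_VAL, S1_VAL), apply the logistic function, then requantize with the output quantization (OFFSET_OUT, SCALE_OUT) and saturate to uchar:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t logistic_qasymm8(uint8_t q, float in_offset, float in_scale,
                             float out_offset, float out_scale)
    {
        float x = std::round(static_cast<float>(q) - in_offset) * in_scale; // dequantize
        x       = 1.f / (1.f + std::exp(-x));                               // logistic
        const float r = std::round(x / out_scale) + out_offset;             // requantize
        return static_cast<uint8_t>(std::min(255.f, std::max(0.f, r)));     // CONVERT_SAT
    }
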
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
deleted file mode 100644
index 557615e..0000000
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-#define DIV(x, y) (x) / (y)
-
-#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
-/** This function adds two tensors.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_add(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-
- // Calculate and store result
- VSTORE(VEC_SIZE)
- (ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
-
-/** This function subtracts one tensor from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_sub(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
- // Calculate and store result
- vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
-
-/** This function divides one tensor from another.
- *
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=float -DDATA_TYPE_IN2=float -DDATA_TYPE_OUT=float
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: Same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: Same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void arithmetic_div(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-
- // Calculate and store result
- vstore16(DIV(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl
index 0972355..e6f470a 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/bounding_box_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/comparisons.cl
similarity index 68%
rename from src/core/CL/cl_kernels/arithmetic_op_quantized.cl
rename to src/core/CL/cl_kernels/comparisons.cl
index fc7fa77..8824b13 100644
--- a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/src/core/CL/cl_kernels/comparisons.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,37 +23,27 @@
*/
#include "helpers.h"
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define DEFINE_KERNEL_STR(name) compare_##name
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
-#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized
+#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name)
-#if defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-
-/** This function adds two tensors.
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
*
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
*
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -61,7 +51,7 @@
* @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
* @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -69,7 +59,7 @@
* @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
* @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -78,7 +68,7 @@
* @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void arithmetic_add_quantized(
+__kernel void DEFINE_KERNEL(OP_NAME)(
TENSOR3D_DECLARATION(in1),
TENSOR3D_DECLARATION(in2),
TENSOR3D_DECLARATION(out))
@@ -88,33 +78,25 @@
Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
- VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
- VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr);
- in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
- in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
-
- const VEC_FLOAT qresf32 = (in1f32 + in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
- // Store result
+ // Calculate and store result
VSTORE(VEC_SIZE)
- (res, 0, (__global uchar *)out.ptr);
+ (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr);
}
-#endif /* defined(VEC_SIZE) */
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
-/** This function subtracts two tensors.
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2)
+/** This function compares two quantized tensors.
*
* @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
* @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
* @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
* @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
* @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -141,7 +123,7 @@
* @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void arithmetic_sub_quantized(
+__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)(
TENSOR3D_DECLARATION(in1),
TENSOR3D_DECLARATION(in2),
TENSOR3D_DECLARATION(out))
@@ -154,15 +136,14 @@
int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
- in_a = SUB(in_a, (int16)((int)OFFSET_IN1));
- in_b = SUB(in_b, (int16)((int)OFFSET_IN2));
+ in_a = in_a - (int16)((int)OFFSET_IN1);
+ in_b = in_b - (int16)((int)OFFSET_IN2);
- const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
- const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
- const float16 qresf32 = (in1f32 - in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
- const uchar16 res = convert_uchar16_sat(convert_int16_rte(qresf32));
+ const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
+ const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+ const int16 res = OP(in1f32, in2f32);
// Store result
- vstore16(res, 0, (__global uchar *)out.ptr);
+ vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr);
}
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */
\ No newline at end of file
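
Note: compare_*_quantized dequantizes both QASYMM8 inputs with their own offset/scale and compares them as floats; in the kernel the vector comparison yields -1 (all bits set) per true lane, which the final convert_uchar16 wraps to 255, so the output is 255 for true and 0 for false. Scalar sketch (illustrative only), shown for the GREATER case:

    #include <cstdint>

    uint8_t compare_greater_qasymm8(uint8_t a, uint8_t b,
                                    int offset_in1, float scale_in1,
                                    int offset_in2, float scale_in2)
    {
        const float af = static_cast<float>(static_cast<int>(a) - offset_in1) * scale_in1;
        const float bf = static_cast<float>(static_cast<int>(b) - offset_in2) * scale_in2;
        return (af > bf) ? 255 : 0;
    }
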
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 0e8805f..c374769 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,22 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+inline VEC_UCHAR requantize(VEC_UCHAR input, float in_offset, float out_offset, float in_scale, float out_scale)
+{
+ const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
+ const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
+ const VEC_UCHAR res_u8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_UCHAR);
+ return res_u8;
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
#if defined(DEPTH) && defined(ELEMENT_SIZE)
#if defined(INPUT1_WIDTH)
@@ -50,6 +64,7 @@
#else // VEC_SIZE
#error "Vector size not supported"
#endif // VEC_SIZE
+
/** This kernel concatenates two input tensors into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
@@ -88,11 +103,15 @@
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements
+ * @param[in] src2_pad_left Left paddings of the second input tensor in unit of elements
*/
__kernel void concatenate_width_x2(
TENSOR4D_DECLARATION(src1),
TENSOR4D_DECLARATION(src2),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ uint src1_pad_right,
+ uint src2_pad_left)
{
Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
@@ -101,16 +120,22 @@
const int y = get_global_id(1);
const int z = get_global_id(2) % (int)DEPTH;
const int w = get_global_id(2) / (int)DEPTH;
- const int x1 = min(x, (int)INPUT1_WIDTH);
- const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
+ const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+ const int x2 = max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left);
// Calculate inputs and output addresses
const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2)
+ src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */
const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond);
@@ -180,13 +205,25 @@
* @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements
+ * @param[in] src2_pad_left Left paddings of the second input tensor in unit of elements
+ * @param[in] src2_pad_right Right paddings of the second input tensor in unit of elements
+ * @param[in] src3_pad_left Left paddings of the third input tensor in unit of elements
+ * @param[in] src3_pad_right Right paddings of the third input tensor in unit of elements
+ * @param[in] src4_pad_left Left paddings of the fourth input tensor in unit of elements
*/
__kernel void concatenate_width_x4(
TENSOR4D_DECLARATION(src1),
TENSOR4D_DECLARATION(src2),
TENSOR4D_DECLARATION(src3),
TENSOR4D_DECLARATION(src4),
- TENSOR4D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(dst),
+ uint src1_pad_right,
+ uint src2_pad_left,
+ uint src2_pad_right,
+ uint src3_pad_left,
+ uint src3_pad_right,
+ uint src4_pad_left)
{
Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
@@ -196,10 +233,10 @@
const int z = get_global_id(2) % (int)DEPTH;
const int w = get_global_id(2) / (int)DEPTH;
- const int x1 = min(x, (int)INPUT1_WIDTH);
- const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
- const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
- const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
+ const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
+ const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left), (int)INPUT2_WIDTH + (int)src2_pad_right - (int)VEC_SIZE);
+ const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)src3_pad_left), (int)INPUT3_WIDTH + (int)src3_pad_right - (int)VEC_SIZE);
+ const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)src4_pad_left);
// Calculate inputs and output addresses
const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
@@ -207,10 +244,21 @@
const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
+ src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+ src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
+ src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
@@ -259,6 +307,7 @@
* @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
+
__kernel void concatenate_width(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst))
@@ -269,9 +318,16 @@
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ VSTORE(VEC_SIZE)
+ (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
}
+
#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
/** This kernel concatenates the input tensor into the output tensor along the third dimension
@@ -308,7 +364,12 @@
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
+
}
#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
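
Note: the requantize() helper added at the top of concatenate.cl lets width/depth concatenation accept QASYMM8 inputs whose quantization info differs from the output's: dequantize with the input's offset/scale, requantize with the output's, round to nearest and saturate to uchar. Scalar sketch (illustrative only, not library code):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize_u8(uint8_t q, float in_offset, float out_offset,
                          float in_scale, float out_scale)
    {
        const float dequantized = (static_cast<float>(q) - in_offset) * in_scale;
        const float requantized = std::round(dequantized / out_scale + out_offset);
        return static_cast<uint8_t>(std::min(255.f, std::max(0.f, requantized)));
    }
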
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
index 8c75ecd..625c6c4 100644
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,7 +72,6 @@
* @param[in] mat4 Coefficient from the convolution matrix
* @param[in] mat5 Coefficient from the convolution matrix
* @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat0 Coefficient from the convolution matrix
* @param[in] mat7 Coefficient from the convolution matrix
* @param[in] mat8 Coefficient from the convolution matrix
* @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
index 605cd09..2c3cafa 100644
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -132,7 +132,6 @@
* @param[in] mat4 Coefficient from the convolution matrix
* @param[in] mat5 Coefficient from the convolution matrix
* @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat0 Coefficient from the convolution matrix
* @param[in] mat7 Coefficient from the convolution matrix
* @param[in] mat8 Coefficient from the convolution matrix
* @param[in] mat9 Coefficient from the convolution matrix
@@ -143,7 +142,6 @@
* @param[in] mat14 Coefficient from the convolution matrix
* @param[in] mat15 Coefficient from the convolution matrix
* @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
* @param[in] mat17 Coefficient from the convolution matrix
* @param[in] mat18 Coefficient from the convolution matrix
* @param[in] mat19 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
index 1abfb15..9dd6a88 100644
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -152,7 +152,6 @@
* @param[in] mat4 Coefficient from the convolution matrix
* @param[in] mat5 Coefficient from the convolution matrix
* @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat0 Coefficient from the convolution matrix
* @param[in] mat7 Coefficient from the convolution matrix
* @param[in] mat8 Coefficient from the convolution matrix
* @param[in] mat9 Coefficient from the convolution matrix
@@ -163,7 +162,6 @@
* @param[in] mat14 Coefficient from the convolution matrix
* @param[in] mat15 Coefficient from the convolution matrix
* @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
* @param[in] mat17 Coefficient from the convolution matrix
* @param[in] mat18 Coefficient from the convolution matrix
* @param[in] mat19 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
index f537326..2a5f4a1 100644
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,7 +173,6 @@
* @param[in] mat4 Coefficient from the convolution matrix
* @param[in] mat5 Coefficient from the convolution matrix
* @param[in] mat6 Coefficient from the convolution matrix
- * @param[in] mat0 Coefficient from the convolution matrix
* @param[in] mat7 Coefficient from the convolution matrix
* @param[in] mat8 Coefficient from the convolution matrix
* @param[in] mat9 Coefficient from the convolution matrix
@@ -184,7 +183,6 @@
* @param[in] mat14 Coefficient from the convolution matrix
* @param[in] mat15 Coefficient from the convolution matrix
* @param[in] mat16 Coefficient from the convolution matrix
- * @param[in] mat10 Coefficient from the convolution matrix
* @param[in] mat17 Coefficient from the convolution matrix
* @param[in] mat18 Coefficient from the convolution matrix
* @param[in] mat19 Coefficient from the convolution matrix
@@ -245,7 +243,6 @@
* @param[in] mat74 Coefficient from the convolution matrix
* @param[in] mat75 Coefficient from the convolution matrix
* @param[in] mat76 Coefficient from the convolution matrix
- * @param[in] mat76 Coefficient from the convolution matrix
* @param[in] mat77 Coefficient from the convolution matrix
* @param[in] mat78 Coefficient from the convolution matrix
* @param[in] mat79 Coefficient from the convolution matrix
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index 611449e..75192e6 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,80 +38,92 @@
/** This function performs a down-scaling depth conversion.
*
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] out_ptr Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] shift The integer shift amount value. Supported data types: S32
*/
__kernel void convert_depth_down(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out),
const int shift)
{
// Get pixels pointer
- Image in = CONVERT_TO_IMAGE_STRUCT(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
- VEC_DATA_TYPE(DATA_TYPE_IN, 16)
- in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
#if defined(IS_DATA_TYPE_FLOAT)
- const DATA_TYPE_IN scale = (DATA_TYPE_IN)(1 << shift);
- vstore16(CONVERT_DOWN(in_data / scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
#else /* defined(IS_DATA_TYPE_FLOAT) */
- vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
#endif /* defined(IS_DATA_TYPE_FLOAT) */
}
/** This function performs an up-scaling depth conversion.
*
- * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] out_ptr Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] shift The integer shift amount value. Supported data types: S32
*/
__kernel void convert_depth_up(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out),
const int shift)
{
// Get pixels pointer
- Image in = CONVERT_TO_IMAGE_STRUCT(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
- VEC_DATA_TYPE(DATA_TYPE_IN, 16)
- in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
#if defined(IS_DATA_TYPE_FLOAT)
- const DATA_TYPE_OUT scale = (DATA_TYPE_OUT)(1 << shift);
- vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) * scale, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr);
#else /* defined(IS_DATA_TYPE_FLOAT) */
- vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
#endif /* defined(IS_DATA_TYPE_FLOAT) */
}
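// A minimal sketch of how the VEC_SIZE-generic body above expands for one hypothetical
// build, e.g. -DDATA_TYPE_IN=float -DDATA_TYPE_OUT=half -DVEC_SIZE=8 -DIS_DATA_TYPE_FLOAT
// (data types and vector width are illustrative; CONVERT_DOWN is the policy defined in this file):
//
//   float8 in_data = vload8(0, (__global float *)in.ptr);
//   vstore8(CONVERT_DOWN(in_data, half8), 0, (__global half *)out.ptr);
//
// The previously hard-coded vload16/vstore16 pair now follows whatever width the host
// selects through -DVEC_SIZE.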
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index bfaa92b..4f6fdfa 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -464,6 +464,104 @@
#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
+/** Reshape the weights for quantized depthwise convolution
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
+ * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
+ * @attention Input's height and width should be 3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_convolution_reshape_weights(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ const int x = get_global_id(0);
+
+ // Load 3x3xVEC_SIZE weights
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
+
+#if defined(TRANSPOSE)
+#if VEC_SIZE != 4
+#error "VEC_SIZE not supported"
+#else // VEC_SIZE != 4
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // VEC_SIZE != 4
+#else // !defined(TRANSPOSE)
+ VSTORE(VEC_SIZE)
+ (w0, 0, dst_addr + 0);
+ VSTORE(VEC_SIZE)
+ (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
+ VSTORE(VEC_SIZE)
+ (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
+#endif // defined(TRANSPOSE)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
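// A rough sketch of the reshaped layout produced above, assuming a 3x3xVEC_SIZE weights
// tensor and DST_WIDTH presumably equal to 9 * VEC_SIZE:
//
//   without -DTRANSPOSE, output row x holds [ w0 | w1 | ... | w8 ] back to back
//   with    -DTRANSPOSE (VEC_SIZE == 4), row x interleaves per channel lane:
//     taps 0..8 of lane .s0, then taps 0..8 of lane .s1, .s2 and .s3 (36 values in total)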
+
#if defined(NCHW)
#define in_stride_x src_stride_x
#define in_stride_y src_stride_y
@@ -504,7 +602,7 @@
* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_weights_reshape(
+__kernel void depthwise_convolution_reshape_weights_generic(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst)
#ifdef HAS_BIAS
@@ -1091,9 +1189,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
@@ -1240,9 +1338,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
@@ -1394,4 +1492,4 @@
}
#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 5a732b4..606af2e 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -574,62 +574,25 @@
#endif /* WEIGHTS_OFFSET != 0 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
- ({ \
- ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0); \
- ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0); \
- acc.s0 += val8.s0 * w8.s0; \
- \
- ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1); \
- ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1); \
- acc.s1 += val8.s1 * w8.s1; \
- \
- ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2); \
- ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2); \
- acc.s2 += val8.s2 * w8.s2; \
- \
- ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3); \
- ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3); \
- acc.s3 += val8.s3 * w8.s3; \
+#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
+ ({ \
+ ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc); \
+ ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc); \
+ acc += val8 * w1; \
})
-#if WEIGHTS_OFFSET != 0
-#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
- ({ \
- ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0); \
- ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0); \
- ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0); \
- \
- ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1); \
- ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1); \
- ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1); \
- \
- ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2); \
- ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2); \
- ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2); \
- \
- ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3); \
- ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3); \
- ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3); \
- })
-#else /* WEIGHTS_OFFSET != 0 */
-#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
-#endif /* WEIGHTS_OFFSET != 0 */
-
#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
({ \
- sum = CONVERT(val0, VEC_INT); \
- ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0); \
- ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0); \
- \
- ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1); \
- ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1); \
- \
- ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2); \
- ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2); \
- \
- ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3); \
- ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3); \
+ sum = val0; \
+ ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum); \
+ ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum); \
+ })
+
+#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
+ ({ \
+ sum = w1; \
+ ARM_DOT(w0.s0123, (uchar4)1, sum); \
+ ARM_DOT(w0.s4567, (uchar4)1, sum); \
})
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
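// A small illustration of the reworked per-lane macro, assuming ARM_DOT(a, b, c)
// accumulates dot(a, b) into c (v0..v8 and the weights below are hypothetical uchar scalars):
//
//   uchar8 w_lo = ...; // taps 0..7 of one channel, as laid out by the reshape kernel
//   uchar  w_hi = ...; // tap 8 of the same channel
//   DOT_PRODUCT(acc, v0, v1, v2, v3, v4, v5, v6, v7, v8, w_lo, w_hi);
//   // roughly: acc += v0*w_lo.s0 + ... + v3*w_lo.s3   (first ARM_DOT)
//   //          acc += v4*w_lo.s4 + ... + v7*w_lo.s7   (second ARM_DOT)
//   //          acc += v8*w_hi;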
@@ -637,6 +600,7 @@
#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
*
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
@@ -664,13 +628,11 @@
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -681,7 +643,7 @@
__kernel void depthwise_convolution_3x3_quantized_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
+ IMAGE_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
@@ -692,11 +654,11 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
- Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+ __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
#if defined(DST_DEPTH)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -716,19 +678,19 @@
int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
- // We compute 4x1x1 [C,W,H] elements
+ // We compute VEC_SIZEx1x1 [C,W,H] elements
VEC_INT acc = 0, sum = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+ VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
+ VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
+ VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
+ VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
+ VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
+ VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
+ VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
+ VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
+ VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -824,8 +786,9 @@
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
*
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
@@ -858,8 +821,6 @@
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -871,7 +832,7 @@
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
+ IMAGE_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
#endif /* defined(HAS_BIAS) */
@@ -882,11 +843,11 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
- Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+ __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
#if defined(DST_DEPTH)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -913,15 +874,15 @@
VEC_INT acc3 = 0, sum3 = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+ VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
+ VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
+ VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
+ VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
+ VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
+ VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
+ VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
+ VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
+ VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1103,9 +1064,11 @@
}
}
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
+/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
*
+ * @note This kernel assumes VEC_SIZE is 4.
+ * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
@@ -1140,8 +1103,6 @@
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
@@ -1149,11 +1110,10 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset The maximum allowed offset for the input tensor
*/
-
__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
- TENSOR3D_DECLARATION(weights),
+ IMAGE_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
#endif // defined(HAS_BIAS)
@@ -1164,11 +1124,11 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else /* defined(DST_DEPTH) */
+#else // defined(DST_DEPTH)
int z = get_global_id(2); // spatial coordinate y
-#endif /* defined(DST_DEPTH) */
+#endif // defined(DST_DEPTH)
- Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+ __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
#if defined(DST_DEPTH)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
@@ -1195,19 +1155,16 @@
VEC_INT sum1 = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
+ uchar16 w0 = VLOAD(16)(0, weights_addr);
+ uchar16 w1 = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w2 = VLOAD(4)(0, weights_addr + 32);
#if INPUT_OFFSET != 0
// Initialize the final result with the weights reduction multiplied by INPUT_OFFSET
- DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+ DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
+ DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+ DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
+ DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
// Multiply the weights reduction with INPUT_OFFSET
acc0 = INPUT_OFFSET * acc0;
@@ -1250,11 +1207,25 @@
VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
- DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);
- DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+ DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
+ DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
+ DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
+ DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
- DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);
- DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+ DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
+ DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
+ DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+ DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
+
+ DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
+ DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
+ DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
+ DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
+
+ DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
+ DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
+ DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
+ DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
@@ -1308,8 +1279,7 @@
VSTORE(VEC_SIZE)
(ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
}
-
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
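// Sketch of how the dot8 stride-1 kernel above reads one reshaped weights row, assuming
// VEC_SIZE == 4 (36 uchar values per row, taps grouped per channel lane as produced by
// depthwise_convolution_reshape_weights with -DTRANSPOSE):
//
//   uchar16 w0: lane0 taps 0..8, lane1 taps 0..6
//   uchar16 w1: lane1 taps 7..8, lane2 taps 0..8, lane3 taps 0..4
//   uchar4  w2: lane3 taps 5..8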
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
new file mode 100644
index 0000000..00d7ed3
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_operation.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** List of all the operations supported by this kernel.
+ * @note ADD and SUB operations, when executed on integers, support saturation */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+#define MAX(x, y) max(x, y)
+#define MIN(x, y) min(x, y)
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define DIV(x, y) (x / y)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
+/** This function executes an element-wise operation between two tensors.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+ in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+ in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+
+ // Calculate and store result
+ VSTORE(VEC_SIZE)
+ (OP(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif /* defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
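// A minimal usage sketch, with illustrative build options
// -DOP=ADD -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short -DVEC_SIZE=16 -DSATURATE:
// OP_FUN_NAME(OP) names the kernel elementwise_operation_ADD and each work item reduces to roughly
//
//   short16 in_a = convert_short16(vload16(0, (__global uchar *)in1.ptr));
//   short16 in_b = convert_short16(vload16(0, (__global uchar *)in2.ptr));
//   vstore16(add_sat(in_a, in_b), 0, (__global short *)out.ptr);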
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
new file mode 100644
index 0000000..1f0533b
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SUB(x, y) (x - y)
+#define ADD(x, y) (x + y)
+#define MAX(x, y) max((x), (y))
+#define MIN(x, y) min((x), (y))
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define DIV(x, y) (x / y)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+
+/** This function executes an element-wise operation between two tensors.
+ *
+ * @attention The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @attention The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @attention The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @attention The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @attention The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @attention The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+ const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
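// A worked example of the dequantize -> OP -> requantize chain above, with hypothetical
// parameters -DOP=ADD -DOFFSET_IN1=128 -DOFFSET_IN2=128 -DOFFSET_OUT=128 -DSCALE_IN1=0.5f
// -DSCALE_IN2=0.5f -DSCALE_OUT=1.f, for one uchar lane with a = 130 and b = 140:
//
//   in1f32  = (130 - 128) * 0.5f = 1.0f
//   in2f32  = (140 - 128) * 0.5f = 6.0f
//   qresf32 = (1.0f + 6.0f) / 1.f + 128 = 135.0f
//   res     = 135 (round-to-nearest-even, then saturated to uchar)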
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
new file mode 100644
index 0000000..92db569
--- /dev/null
+++ b/src/core/CL/cl_kernels/elementwise_unary.cl
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+/** Calculate the inverse square root
+ *
+ * @param[in] input Input vector.
+ *
+ * @return the inverse square root of the input
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) inverse_sqrt(const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) input)
+{
+ return rsqrt(input);
+}
+
+/** Calculate exponential
+ *
+ * @param[in] input Input vector.
+ *
+ * @return exponential
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) exponential(const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) input)
+{
+ return exp(input);
+}
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+/** Calculate the inverse square root
+ *
+ * @param[in] input Single element.
+ *
+ * @return the inverse square root of the input
+ */
+inline DATA_TYPE inverse_sqrt(const DATA_TYPE input)
+{
+ return rsqrt(input);
+}
+
+/** Calculate exponential
+ *
+ * @param[in] input Single element.
+ *
+ * @return exponential
+ */
+inline DATA_TYPE exponential(const DATA_TYPE input)
+{
+ return exp(input);
+}
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+/** Applies an element-wise unary operator to a tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: F16/F32.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+ VECTOR_DECLARATION(in),
+ VECTOR_DECLARATION(out))
+{
+ Vector in = CONVERT_TO_VECTOR_STRUCT(in);
+ Vector out = CONVERT_TO_VECTOR_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+ out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
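// Sketch of the LAST_ACCESSED_X clamping above, assuming the host sets
// LAST_ACCESSED_X = width - VEC_SIZE (the numbers below are hypothetical):
//
//   width = 18, VEC_SIZE = 8  ->  LAST_ACCESSED_X = 10
//   work item 0: xi = 0,  no shift, elements 0..7
//   work item 1: xi = 8,  no shift, elements 8..15
//   work item 2: xi = 16, shifted back by max(16 - 10, 0) = 6, elements 10..17
//
// The last work item recomputes a few already-written elements but never accesses
// out-of-bounds memory.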
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl
new file mode 100644
index 0000000..d6fe52d
--- /dev/null
+++ b/src/core/CL/cl_kernels/gather.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor
+ * @param[in] indices_ptr Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+ TENSOR4D_DECLARATION(input),
+ VECTOR_DECLARATION(indices),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+ const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+ const uint index = *(__global const uint *)vector_offset(&indices, px);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif AXIS == 1
+ const uint index = *(__global const uint *)vector_offset(&indices, py);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif AXIS == 2
+ const uint index = *(__global const uint *)vector_offset(&indices, pz);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif AXIS == 3
+ const uint index = *(__global const uint *)vector_offset(&indices, pw);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif //AXIS
+
+ *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
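+
+// Worked example (illustrative): with -DAXIS=1 each work item computes
+//   output(x, y, z, w) = input(x, indices[y], z, w)
+// so an indices vector { 3, 0 } gathers rows 3 and 0 of the input. Typical build options
+// (values assumed) would be: -DDATA_TYPE=float -DAXIS=1 -DINPUT_DIM_Z=16 -DOUTPUT_DIM_Z=16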
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 7de15d0..4736f80 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,1576 @@
* SOFTWARE.
*/
#include "helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if(SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+ ({ \
+ a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
+ })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+ ({})
+#endif // (SRC_WIDTH % K0)
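+
+// Worked example (illustrative): with -DK0=4 and -DSRC_WIDTH=10, the block at x = 2 covers columns
+// 8..11. The comparison (x * K0 + {0,1,2,3}) < SRC_WIDTH holds only for columns 8 and 9, so the
+// select() above keeps those two loaded values and zeroes out columns 10 and 11.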
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+ )
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
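+
+    // Worked example (illustrative): with M0=2, K0=4 and V0=2 each work item handles one 2x4 block.
+    // Without -DINTERLEAVE (OUTPUT_OFFSET_X = BLOCK_SIZE, OUTPUT_STEP_X = K0) an output row holds
+    //   block0.row0 | block0.row1 | block1.row0 | block1.row1
+    // With -DINTERLEAVE (OUTPUT_OFFSET_X = K0, OUTPUT_STEP_X = K0 * V0) the rows of the V0 blocks alternate:
+    //   block0.row0 | block1.row0 | block0.row1 | block1.row1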
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+ (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ // Note for the REINTERPRET_INPUT_AS_3D case
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ zin0 *= (cross_plane_pad * src_stride_y);
+#if M0 > 1
+ zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ zin1 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+ zin2 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+ zin3 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+ zin4 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+ zin5 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+ zin6 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+ zin7 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 7
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ // Load values from the LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+
+ // ---------------------------Store output values ------------------------------
+
+ VSTORE(K0)
+ (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if M0 > 1
+ VSTORE(K0)
+ (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(K0)
+ (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(K0)
+ (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(K0)
+ (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(K0)
+ (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(K0)
+ (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(K0)
+ (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 7
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ DATA_TYPE res1 = a4.s##i; \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+ })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(2) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(3) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
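+
+// Note (illustrative): in the TRANSPOSE_COLUMN_AND_STORE macros above, the column index i is a
+// single hex digit (0-9, A-F): a0.s##i selects column i of each loaded row, while 0x##i turns the
+// same digit into the integer offset of the output column. For example, i = A stores column 10 of
+// the block at output_ptr + 10 * output_step_x * sizeof(DATA_TYPE).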
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+ )
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *
+ (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ // Note for the REINTERPRET_INPUT_AS_3D case
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ zin0 *= (cross_plane_pad * src_stride_y);
+#if M0 > 1
+ zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ zin1 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+ zin2 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+ zin3 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+ zin4 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+ zin5 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+ zin6 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
+ zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+ zin7 *= (cross_plane_pad * src_stride_y);
+#endif // M0 > 7
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ // Load values from the LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+
+ // ---------------------------Transpose and store block -----------------------
+
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
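+
+// Example build options (illustrative, all values assumed): reshaping an F32 LHS matrix of width 31
+// into transposed 4x4 blocks, with V0=2 and interleaving enabled, could be compiled with:
+//   -DDATA_TYPE=float -DSRC_WIDTH=31 -DM0=4 -DK0=4 -DV0=2 -DINTERLEAVE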
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (i.e. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (i.e. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
+ x / (uint)H0)
+ * (uint)dst_stride_y)
+ + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+ if(y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#endif // K0 > 1
+#if K0 > 2
+ if(y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if(y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if(y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if(y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if(y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if(y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if(y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if(y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if(y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if(y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if(y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if(y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if(y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if(y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Store output values ------------------------------
+ VSTORE(N0)
+ (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if K0 > 1
+ VSTORE(N0)
+ (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 1
+#if K0 > 2
+ VSTORE(N0)
+ (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 2
+#if K0 > 3
+ VSTORE(N0)
+ (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 3
+#if K0 > 4
+ VSTORE(N0)
+ (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 4
+#if K0 > 8
+ VSTORE(N0)
+ (a8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (a9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(N0)
+ (aF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (i.e. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (i.e. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /
+ (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ if(y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#if K0 > 2
+ if(y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if(y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if(y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if(y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if(y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if(y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if(y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if(y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if(y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if(y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if(y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if(y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if(y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if(y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Transpose the block ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+ // This part computes the following transpositions:
+ // 2x2 -> 2x2
+ // 2x4 -> 4x2
+ // 2x8 -> 8x2
+ // 2x16 -> 16x2
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+ // This part computes the following transpositions:
+ // 3x2 -> 2x3
+ // 3x4 -> 4x3
+ // 3x8 -> 8x3
+ // 3x16 -> 16x3
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+ // This part computes the following transpositions:
+ // 4x2 -> 2x4
+ // 4x4 -> 4x4
+ // 4x8 -> 8x4
+ // 4x16 -> 16x4
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+ // This part computes the following transpositions:
+ // 8x2 -> 2x8
+ // 8x4 -> 4x8
+ // 8x8 -> 8x8
+ // 8x16 -> 16x8
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+ // This part computes the following transpositions:
+ // 16x2 -> 2x16
+ // 16x4 -> 4x16
+ // 16x8 -> 8x16
+ // 16x16 -> 16x16
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+ a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+ a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+ a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+ a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+ a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+ a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+ a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+ a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+ a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+ a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+ a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+ a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+ a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+ a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+ a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+ a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+
+ // ---------------------------Store the output values ------------------------------
+
+ VSTORE(K0)
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+ VSTORE(K0)
+ (res2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+ VSTORE(K0)
+ (res3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+ VSTORE(K0)
+ (res4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (res5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (res6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (res7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+ VSTORE(K0)
+ (res8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (res9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+ VSTORE(K0)
+ (resF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ c = fma(a.s8, b.s8, c); \
+ c = fma(a.s9, b.s9, c); \
+ c = fma(a.sA, b.sA, c); \
+ c = fma(a.sB, b.sB, c); \
+ c = fma(a.sC, b.sC, c); \
+ c = fma(a.sD, b.sD, c); \
+ c = fma(a.sE, b.sE, c); \
+ c = fma(a.sF, b.sF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
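+
+// Worked example (illustrative): with -DK0=4 and -DN0=2, ARM_DOT_K0XN0(a, b, c) expands to
+//   ARM_DOT_K0(a, b0, c.s0); ARM_DOT_K0(a, b1, c.s1);
+// i.e. component j of c accumulates the K0-element dot product between the LHS row fragment a and
+// row j (b0, b1, ...) of the transposed RHS block, implemented as a chain of fma() calls.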
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint k,
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for(int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 2,4 - 2,8 - 2,16
+ // 3,4 - 3,8 - 3,16
+ // 4,4 - 4,8 - 4,16
+ // 5,4 - 5,8 - 5,16
+ // 6,4 - 6,8 - 6,16
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 0 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 1 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 2 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 3 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 4 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 5 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 6 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 7 * LHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // M0 > 7
+
+ // Load values from RHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
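+ // Worked example: with M0=4, HEIGHT_GEMM3D=8 and DEPTH_GEMM3D=2, the work-item at
+ // get_global_id(1)=1 writes output rows 4..7, which all lie in plane 0, so zout0..zout3 stay 0;
+ // at get_global_id(1)=2 the rows 8..11 lie in plane 1, so each zout becomes
+ // 1 * (dst_cross_plane_pad * dst_stride_y) and the stores below jump over the cross plane padding.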
+ zout0 = (0 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+ zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+ zout1 = (1 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+ zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zout2 = (2 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+ zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zout3 = (3 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+ zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zout4 = (4 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+ zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zout5 = (5 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+ zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zout6 = (6 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+ zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zout7 = (7 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+ zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+ c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+ c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+ c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+ c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+ c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+ c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+ c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+ // Store output block
+ VSTORE(N0)
+ (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+ VSTORE(N0)
+ (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(N0)
+ (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(N0)
+ (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(N0)
+ (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(N0)
+ (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(N0)
+ (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(N0)
+ (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
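+// Example configuration: one valid combination of the compile-time options documented above is
+//   -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=4 -DLHS_INTERLEAVE -DRHS_INTERLEAVE
+// (together with the -DK definition required by the surrounding #if guard). With M0=4 and N0=4
+// each work-item produces a 4x4 block of the destination, so the kernel is expected to be
+// enqueued over roughly (N / 4) x (M / 4) work-items per batch.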
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)
#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
@@ -193,7 +1763,7 @@
vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#else // defined(UNROLL_BLOCK)
+#else // defined(UNROLL_BLOCK)
VEC_DATA_TYPE(DATA_TYPE, 4)
val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
@@ -214,6 +1784,8 @@
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -226,6 +1798,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -238,6 +1812,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -251,6 +1829,9 @@
*/
__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -340,6 +1921,16 @@
c30 = c30 * (float4)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
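+ // With ADD_VEC_C the stored tile is therefore the (optionally alpha-scaled) A * B result plus
+ // the same float4 read from src2 at this work-item's column offset: src2 acts as a per-column
+ // bias broadcast across the four rows of the 4x4 output block.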
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -389,7 +1980,9 @@
}
/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
*
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
@@ -404,6 +1997,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -416,6 +2011,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -429,6 +2028,9 @@
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -653,6 +2255,28 @@
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ c00 += c0.s0;
+ c01 += c0.s1;
+ c02 += c0.s2;
+ c03 += c0.s3;
+ c10 += c0.s0;
+ c11 += c0.s1;
+ c12 += c0.s2;
+ c13 += c0.s3;
+ c20 += c0.s0;
+ c21 += c0.s1;
+ c22 += c0.s2;
+ c23 += c0.s3;
+ c30 += c0.s0;
+ c31 += c0.s1;
+ c32 += c0.s2;
+ c33 += c0.s3;
+#endif /* defined(ADD_VEC_C) */
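+ // Same per-column bias as in the generic kernel above, but this Bifrost variant keeps the 4x4
+ // tile in sixteen scalar accumulators (cRC = row R, column C), so each lane of the float4 is
+ // added to the matching column of every row.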
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -705,6 +2329,8 @@
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -717,6 +2343,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -729,6 +2357,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -742,6 +2374,9 @@
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -831,6 +2466,20 @@
c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -882,6 +2531,8 @@
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in a 32-bit floating point variable.
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -894,6 +2545,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -906,6 +2559,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -919,6 +2576,9 @@
*/
__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -1008,6 +2668,20 @@
c30 = c30 * (float8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float8 c0 = convert_float8(vload8(0, src2_addr));
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
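+ // The bias vector is stored as F16 but this variant accumulates in F32, so the eight half
+ // values are widened with convert_float8() before being added to the float8 accumulators.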
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -1059,6 +2733,8 @@
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -1071,6 +2747,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1083,6 +2761,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1093,6 +2775,9 @@
*/
__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -1264,6 +2949,20 @@
c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -1322,7 +3021,9 @@
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
*
* @note This OpenCL kernel works with floating point data types (F16/F32)
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -1338,6 +3039,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1350,6 +3053,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1364,6 +3071,9 @@
*/
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -1564,6 +3274,26 @@
acc3 = acc3 * (VECTOR_TYPE)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global DATA_TYPE *src2_addr = (__global DATA_TYPE *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ VECTOR_TYPE c0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
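+ // The bias path here is written with DATA_TYPE/VECTOR_TYPE, so the same code covers both the
+ // F16 and the F32 builds of this kernel; c0 is NUM_ELEMS_PROCESSED_PER_THREAD_X elements wide
+ // and is added to every active row accumulator.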
+
int z = get_global_id(2);
#if defined(REINTERPRET_OUTPUT_AS_3D)
@@ -1634,6 +3364,8 @@
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -1649,6 +3381,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1661,6 +3395,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1675,6 +3413,9 @@
*/
__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2029,6 +3770,34 @@
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ acc00 += c0.s0;
+ acc01 += c0.s1;
+ acc02 += c0.s2;
+ acc03 += c0.s3;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 += c0.s0;
+ acc11 += c0.s1;
+ acc12 += c0.s2;
+ acc13 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 += c0.s0;
+ acc21 += c0.s1;
+ acc22 += c0.s2;
+ acc23 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 += c0.s0;
+ acc31 += c0.s1;
+ acc32 += c0.s2;
+ acc33 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -2088,6 +3857,8 @@
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
@@ -2104,6 +3875,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2116,6 +3889,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2130,6 +3907,9 @@
*/
__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2416,6 +4196,26 @@
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float2 c0 = vload2(0, src2_addr);
+
+ acc00 += c0.s0;
+ acc01 += c0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 += c0.s0;
+ acc11 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 += c0.s0;
+ acc21 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 += c0.s0;
+ acc31 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
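+ // This variant keeps a tile that is only two columns wide (acc*0/acc*1), so the bias is loaded
+ // as a float2 and its two lanes are added to the matching column of every active row.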
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -2476,6 +4276,8 @@
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -2491,6 +4293,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2503,6 +4307,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2517,6 +4325,9 @@
*/
__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2757,6 +4568,26 @@
#endif // defined(ALPHA)
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ hacc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ hacc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ hacc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ hacc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
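+ // Unlike the interleaved acc32 kernel above, the accumulators here have already been narrowed
+ // to F16 (hacc*) by the time the bias is applied, so the half8 loaded from src2 is added
+ // directly without widening to float.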
+
int z = get_global_id(2);
// Compute destination address
@@ -2824,6 +4655,8 @@
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -2839,6 +4672,8 @@
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2851,6 +4686,10 @@
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2865,6 +4704,9 @@
*/
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -3089,6 +4931,26 @@
acc3 = acc3 * (half8)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
int z = get_global_id(2);
// Compute destination address
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 8c1fa54..277338b 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "helpers.h"
#include "helpers_asymm.h"
+#include "repeat.h"
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
@@ -1943,6 +1944,574 @@
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ ARM_DOT(a, b, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ ARM_DOT(a.s0123, b.s0123, c); \
+ ARM_DOT(a.s4567, b.s4567, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ ARM_DOT(a.s0123, b.s0123, c); \
+ ARM_DOT(a.s4567, b.s4567, c); \
+ ARM_DOT(a.s89AB, b.s89AB, c); \
+ ARM_DOT(a.sCDEF, b.sCDEF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0
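+// Note: for K0 == 2 and K0 == 3 the operands are zero-padded up to four lanes so that the
+// hardware dot product (ARM_DOT) can still be used; the zero lanes add nothing to the result.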
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ c += (uint)a.s1 * b.s1; \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ c += (uint)a.s1 * b.s1; \
+ c += (uint)a.s2 * b.s2; \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ c += (uint)a.s1 * b.s1; \
+ c += (uint)a.s2 * b.s2; \
+ c += (uint)a.s3 * b.s3; \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ c += (uint)a.s1 * b.s1; \
+ c += (uint)a.s2 * b.s2; \
+ c += (uint)a.s3 * b.s3; \
+ c += (uint)a.s4 * b.s4; \
+ c += (uint)a.s5 * b.s5; \
+ c += (uint)a.s6 * b.s6; \
+ c += (uint)a.s7 * b.s7; \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ c += (uint)a.s1 * b.s1; \
+ c += (uint)a.s2 * b.s2; \
+ c += (uint)a.s3 * b.s3; \
+ c += (uint)a.s4 * b.s4; \
+ c += (uint)a.s5 * b.s5; \
+ c += (uint)a.s6 * b.s6; \
+ c += (uint)a.s7 * b.s7; \
+ c += (uint)a.s8 * b.s8; \
+ c += (uint)a.s9 * b.s9; \
+ c += (uint)a.sA * b.sA; \
+ c += (uint)a.sB * b.sB; \
+ c += (uint)a.sC * b.sC; \
+ c += (uint)a.sD * b.sD; \
+ c += (uint)a.sE * b.sE; \
+ c += (uint)a.sF * b.sF; \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0
+
+#endif //defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
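+// Example expansion (no dot product extension, -DK0=4 -DN0=4): ARM_DOT_K0XN0(a0, b, c0) is
+// equivalent to
+//   c0.s0 += (uint)a0.s0 * b0.s0 + (uint)a0.s1 * b0.s1 + (uint)a0.s2 * b0.s2 + (uint)a0.s3 * b0.s3;
+//   c0.s1 += ... (same dot product against b1)
+//   c0.s2 += ... (same dot product against b2)
+//   c0.s3 += ... (same dot product against b3)
+// i.e. one K0-wide dot product per column of the N0-wide accumulator.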
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint k,
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X + (get_global_id(1) / V0) * (uint)lhs_stride_y + (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for(int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 2,4 - 2,8 - 2,16
+ // 3,4 - 3,8 - 3,16
+ // 4,4 - 4,8 - 4,16
+ // 5,4 - 5,8 - 5,16
+ // 6,4 - 6,8 - 6,16
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(uchar, K0)
+ a0 = VLOAD(K0)(0, lhs_addr + 0 * LHS_STEP_X);
+#if M0 > 1
+ VEC_DATA_TYPE(uchar, K0)
+ a1 = VLOAD(K0)(0, lhs_addr + 1 * LHS_STEP_X);
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(uchar, K0)
+ a2 = VLOAD(K0)(0, lhs_addr + 2 * LHS_STEP_X);
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(uchar, K0)
+ a3 = VLOAD(K0)(0, lhs_addr + 3 * LHS_STEP_X);
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(uchar, K0)
+ a4 = VLOAD(K0)(0, lhs_addr + 4 * LHS_STEP_X);
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(uchar, K0)
+ a5 = VLOAD(K0)(0, lhs_addr + 5 * LHS_STEP_X);
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(uchar, K0)
+ a6 = VLOAD(K0)(0, lhs_addr + 6 * LHS_STEP_X);
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(uchar, K0)
+ a7 = VLOAD(K0)(0, lhs_addr + 7 * LHS_STEP_X);
+#endif // M0 > 7
+
+ // Load values from RHS matrix
+ VEC_DATA_TYPE(uchar, K0)
+ b0 = VLOAD(K0)(0, rhs_addr + 0 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b1 = VLOAD(K0)(0, rhs_addr + 1 * RHS_STEP_X);
+#if N0 > 2
+ VEC_DATA_TYPE(uchar, K0)
+ b2 = VLOAD(K0)(0, rhs_addr + 2 * RHS_STEP_X);
+#endif // N0 > 2
+#if N0 > 3
+ VEC_DATA_TYPE(uchar, K0)
+ b3 = VLOAD(K0)(0, rhs_addr + 3 * RHS_STEP_X);
+#endif // N0 > 3
+#if N0 > 4
+ VEC_DATA_TYPE(uchar, K0)
+ b4 = VLOAD(K0)(0, rhs_addr + 4 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b5 = VLOAD(K0)(0, rhs_addr + 5 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b6 = VLOAD(K0)(0, rhs_addr + 6 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b7 = VLOAD(K0)(0, rhs_addr + 7 * RHS_STEP_X);
+#endif // N0 > 4
+#if N0 > 8
+ VEC_DATA_TYPE(uchar, K0)
+ b8 = VLOAD(K0)(0, rhs_addr + 8 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b9 = VLOAD(K0)(0, rhs_addr + 9 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bA = VLOAD(K0)(0, rhs_addr + 10 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bB = VLOAD(K0)(0, rhs_addr + 11 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bC = VLOAD(K0)(0, rhs_addr + 12 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bD = VLOAD(K0)(0, rhs_addr + 13 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bE = VLOAD(K0)(0, rhs_addr + 14 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bF = VLOAD(K0)(0, rhs_addr + 15 * RHS_STEP_X);
+#endif // N0 > 8
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(int)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zout0 = (0 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+ zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+ zout1 = (1 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+ zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zout2 = (2 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+ zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zout3 = (3 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+ zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zout4 = (4 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+ zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zout5 = (5 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+ zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zout6 = (6 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+ zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zout7 = (7 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+ zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Store output block
+ VSTORE(N0)
+ (CONVERT_SAT(c0, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+ VSTORE(N0)
+ (CONVERT_SAT(c1, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(N0)
+ (CONVERT_SAT(c2, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(N0)
+ (CONVERT_SAT(c3, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(N0)
+ (CONVERT_SAT(c4, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(N0)
+ (CONVERT_SAT(c5, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(N0)
+ (CONVERT_SAT(c6, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(N0)
+ (CONVERT_SAT(c7, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
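For reference, the arithmetic accumulated by the kernel above reduces to a plain uint8 GEMM with 32-bit accumulators; a minimal scalar C++ sketch (row-major buffers assumed, RHS stored transposed as N x K, names illustrative and not part of the patch):

#include <cstddef>
#include <cstdint>
#include <vector>

// dst[m][n] = sum_k lhs[m][k] * rhs[n][k]; uint8 inputs, int32 accumulation/output.
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &lhs, // M x K, row-major
                                        const std::vector<uint8_t> &rhs, // N x K, row-major (RHS transposed)
                                        int M, int N, int K)
{
    std::vector<int32_t> dst(std::size_t(M) * N, 0);
    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            int32_t acc = 0;
            for(int k = 0; k < K; ++k)
            {
                acc += int32_t(lhs[m * K + k]) * int32_t(rhs[n * K + k]);
            }
            dst[m * N + n] = acc;
        }
    }
    return dst;
}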
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the dot8 instruction.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint k,
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Note: ARM_DOT_K0XN0 is generated with the dot8 instruction
+ gemmlowp_mm_reshaped_lhs_nt_rhs_t(lhs_ptr,
+ lhs_stride_x,
+ lhs_step_x,
+ lhs_stride_y,
+ lhs_step_y,
+ lhs_offset_first_element_in_bytes,
+ rhs_ptr,
+ rhs_stride_x,
+ rhs_step_x,
+ rhs_stride_y,
+ rhs_step_y,
+ rhs_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_offset_first_element_in_bytes,
+ k,
+ lhs_stride_z,
+ rhs_stride_z,
+ dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ );
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
+
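The compile-time configuration listed in the kernel documentation above (M0/N0/K0/V0/H0 and the interleave flags) is supplied as OpenCL build options; a hypothetical host-side helper assembling that option string could look like the sketch below (the helper name and signature are illustrative, not a library API):

#include <string>

std::string gemmlowp_reshaped_build_options(int m0, int n0, int k0, int v0, int h0,
                                            bool lhs_interleave, bool rhs_interleave)
{
    std::string opts = "-DM0=" + std::to_string(m0) + " -DN0=" + std::to_string(n0) + " -DK0=" + std::to_string(k0);
    opts += " -DV0=" + std::to_string(v0) + " -DH0=" + std::to_string(h0);
    if(lhs_interleave)
    {
        opts += " -DLHS_INTERLEAVE";
    }
    if(rhs_interleave)
    {
        opts += " -DRHS_INTERLEAVE";
    }
    return opts; // e.g. "-DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 -DLHS_INTERLEAVE"
}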
#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
*
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
index bc6f4b5..a947dad 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/generate_proposals.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 7ee97d9..180bd50 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -50,6 +50,9 @@
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
+#define float1 float
+#define half1 half
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 186d5a8..2bf59e4 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -1029,6 +1029,177 @@
#endif // HAS_BIAS
}
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s0)); \
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s1)); \
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s2)); \
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s3)); \
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s4)); \
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s5)); \
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s6)); \
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s7)); \
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond1)); \
+ \
+ VSTORE(VECTOR_SIZE) \
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH); \
+ })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ VSTORE(VECTOR_SIZE) \
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH); \
+ })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int8 offset0 = 0;
+ int offset1 = 0;
+
+ // Clamp xi
+ int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+ int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+ xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+ xi_offset0 *= (int8)src_stride_y;
+ xi_offset1 *= (int)src_stride_y;
+
+ // Out-of-bound condition for X
+ int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+ int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ IM2COL1x9(0);
+ IM2COL1x9(1);
+ IM2COL1x9(2);
+ IM2COL1x9(3);
+ IM2COL1x9(4);
+ IM2COL1x9(5);
+ IM2COL1x9(6);
+ IM2COL1x9(7);
+ IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+
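The 9x9 NHWC im2col kernel above writes, for every convolution output position, one row of 81 * SRC_DEPTH elements laid out as (ky*9 + kx)*SRC_DEPTH + channel. A minimal scalar C++ reference of that layout (dilation omitted for brevity, float data and a symmetric pad assumed; this is a sketch, not the library routine):

#include <cstddef>
#include <vector>

std::vector<float> im2col_nhwc_ref(const std::vector<float> &src, int W, int H, int C,
                                   int K, int stride, int pad, float pad_value)
{
    const int out_w = (W + 2 * pad - K) / stride + 1;
    const int out_h = (H + 2 * pad - K) / stride + 1;
    // One output row per convolution output position, K*K*C elements per row.
    std::vector<float> dst(std::size_t(out_w) * out_h * K * K * C, pad_value);
    for(int yo = 0; yo < out_h; ++yo)
    {
        for(int xo = 0; xo < out_w; ++xo)
        {
            const std::size_t dst_row = std::size_t(yo) * out_w + xo;
            for(int ky = 0; ky < K; ++ky)
            {
                for(int kx = 0; kx < K; ++kx)
                {
                    const int xi = xo * stride - pad + kx;
                    const int yi = yo * stride - pad + ky;
                    if(xi < 0 || xi >= W || yi < 0 || yi >= H)
                    {
                        continue; // out-of-bound positions keep pad_value
                    }
                    for(int c = 0; c < C; ++c)
                    {
                        dst[(dst_row * K * K + std::size_t(ky) * K + kx) * C + c] = src[(std::size_t(yi) * W + xi) * C + c];
                    }
                }
            }
        }
    }
    return dst;
}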
/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 0b6df39..390f8fc 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -32,6 +32,7 @@
#define LOAD_OP(offset, ptr) vload4(offset, ptr)
#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
+#if defined(NUM_SLICES)
/** Apply cross-map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -91,9 +92,10 @@
STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
+#endif /* defined(NUM_SLICES) */
#if defined(WIDTH_SIZE)
-/** Apply in-map normalization.
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -117,8 +119,8 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
{
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -170,3 +172,83 @@
STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
#endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+ const int current_cols = get_global_id(1);
+ const int first_col = max(-(int)RADIUS, -current_cols);
+ const int last_col = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols);
+
+#if defined(IN_MAP_2D)
+ const int current_rows = get_global_id(2);
+ const int first_row = max(-(int)RADIUS, -current_rows);
+ const int last_row = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#endif /* defined(IN_MAP_2D) */
+ for(int i = first_col; i <= last_col; ++i)
+ {
+#if defined(IN_MAP_2D)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j));
+#else /* defined(IN_MAP_2D) */
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0));
+#endif /* defined(IN_MAP_2D) */
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+ STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
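As a reference for the in-map normalization computed above, the per-element formula is out[x] = in[x] / (KAPPA + COEFF * sum_{d=-RADIUS..RADIUS} in[x+d]^2)^BETA, with COEFF = alpha / norm_size. A minimal 1D scalar C++ sketch (window clamped at the borders, as in the kernel):

#include <cmath>
#include <vector>

std::vector<float> lrn_in_map_ref(const std::vector<float> &in, int radius, float coeff, float beta, float kappa)
{
    const int n = static_cast<int>(in.size());
    std::vector<float> out(in.size());
    for(int x = 0; x < n; ++x)
    {
        float acc = 0.f;
        for(int d = -radius; d <= radius; ++d)
        {
            const int idx = x + d;
            if(idx >= 0 && idx < n)
            {
                acc += in[idx] * in[idx]; // sum of squares over the window
            }
        }
        out[x] = in[x] / std::pow(kappa + coeff * acc, beta);
    }
    return out;
}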
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 03fc15e..77f03f7 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(DEPTH_IN)
-/** Perform a DCHW -> DHWC permute operation on an input tensor.
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/** Perform a permute operation on an input tensor of shape DCHW.
*
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention The permutation vector is passed as a preprocessor argument using -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
*
* @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
@@ -48,81 +49,26 @@
* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void permute_201(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
+__kernel void permute(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
+ int out_index[4] = { 0 };
+ int in_index[4] = { 0 };
+
+ in_index[0] = get_global_id(0); // W
+ in_index[1] = get_global_id(1); // H
+ in_index[2] = get_global_id(2) % DEPTH_IN; // C
+ in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+ out_index[0] = in_index[P1];
+ out_index[1] = in_index[P2];
+ out_index[2] = in_index[P3];
+ out_index[3] = in_index[P4];
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
}
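The remapping performed by the generic permute kernel above is simply out_coordinate[i] = in_coordinate[P(i+1)]; a small C++ sketch of that index shuffle:

#include <array>

// in_index = {x, y, c, b} of the element being copied; p = {P1, P2, P3, P4}.
std::array<int, 4> permute_coords(const std::array<int, 4> &in_index, const std::array<int, 4> &p)
{
    return {{ in_index[p[0]], in_index[p[1]], in_index[p[2]], in_index[p[3]] }};
}
// Example from the kernel documentation: with -DP1=2 -DP2=1 -DP3=0 -DP4=3 an element
// read at (x, y, c, b) is written to (c, y, x, b).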
-
-/** Perform a DCHW -> DWCH permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_120(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
-}
-
-/** Perform a DCHW -> HWCD permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_3201(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN)
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
index be072ec..046151b 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -104,88 +104,6 @@
return idx;
}
-
-/** Compute prior boxes and clip (NHWC)
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] idx Index to write to
- * @param[in] center_x Center value of the x axis
- * @param[in] center_y Center value of the y axis
- * @param[in] box_width Prior box width
- * @param[in] box_height Prior box height
- *
- */
-inline void calculate_xy_min_max_nhwc(Tensor3D *out, int idx, float center_x, float center_y, float box_width, float box_height)
-{
- float xmin = (center_x - box_width / 2.f) / WIDTH;
- float ymin = (center_y - box_height / 2.f) / HEIGHT;
- float xmax = (center_x + box_width / 2.f) / WIDTH;
- float ymax = (center_y + box_height / 2.f) / HEIGHT;
-
-#if defined(CLIP)
- xmin = clamp(xmin, 0.f, 1.f);
- ymin = clamp(ymin, 0.f, 1.f);
- xmax = clamp(xmax, 0.f, 1.f);
- ymax = clamp(ymax, 0.f, 1.f);
-#endif // defined(CLIP)
-
- *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 0, 0)) = xmin;
- *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 1, 0)) = ymin;
- *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 2, 0)) = xmax;
- *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 3, 0)) = ymax;
-}
-
-/** Compute prior boxes (NHWC)
- *
- * @param[in,out] out Tensor output
- * @param[in] max The maximum values
- * @param[in] aspect_ratios The aspect ratio values
- * @param[in] max_size The maximum values values size
- * @param[in] aspect_ratios_size The aspect ratio values size
- * @param[in] min_size The minimum values size
- * @param[in] min_idx Index of the min vector
- * @param[in] idx Index to write to
- *
- * @return The updated index
- */
-inline int calculate_min_nhwc(Tensor3D *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
-{
- const float center_x = ((float)(get_global_id(1) % LAYER_WIDTH) + OFFSET) * STEP_X;
- const float center_y = ((float)(get_global_id(1) / LAYER_WIDTH) + OFFSET) * STEP_Y;
-
- float box_width = min_size;
- float box_height = min_size;
-
- calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
- if(max_size > 0)
- {
- box_width = sqrt(min_size * max[min_idx]);
- box_height = box_width;
- calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
- }
- for(unsigned int i = 0; i < aspect_ratios_size; ++i)
- {
- if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
- {
- continue;
- }
- box_width = min_size * sqrt(aspect_ratios[i]);
- box_height = min_size * rsqrt(aspect_ratios[i]);
-
- calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
- idx += 4;
- }
-
- return idx;
-}
-
/** Calculate prior boxes with NCHW format.
*
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
@@ -218,39 +136,4 @@
vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
}
}
-
-/** Calculate prior boxes with NHWC format.
- *
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] min The minimum values
- * @param[in] max The maximum_values
- * @param[in] aspect_ratios The aspect ratio values
- * @param[in] min_size The minimum values size
- * @param[in] max_size The maximum_values values size
- * @param[in] aspect_ratios_size The aspect ratio values size
- */
-__kernel void prior_box_layer_nhwc(TENSOR3D_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
- unsigned int aspect_ratios_size)
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int idx = 0;
- for(unsigned int i = 0; i < min_size; ++i)
- {
- idx = calculate_min_nhwc(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
- }
-
- for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
- {
- *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 0, 1)) = VARIANCE_0;
- *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 1, 1)) = VARIANCE_1;
- *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 2, 1)) = VARIANCE_2;
- *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 3, 1)) = VARIANCE_3;
- }
-}
#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/range.cl
new file mode 100644
index 0000000..d122c9a
--- /dev/null
+++ b/src/core/CL/cl_kernels/range.cl
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
+/** Generates a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
+ *
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range(
+ VECTOR_DECLARATION(out))
+{
+ uint id = get_global_id(0) * VECTOR_SIZE;
+ __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+ DATA_TYPE seq;
+ seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
+
+ *((__global DATA_TYPE *)dst_ptr) = seq;
+#else // VECTOR_SIZE == 1
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ seq;
+
+ seq.s0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+#if VECTOR_SIZE > 1
+ seq.s1 = seq.s0 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 2
+ seq.s2 = seq.s1 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 3
+ seq.s3 = seq.s2 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 4
+ seq.s4 = seq.s3 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 5
+ seq.s5 = seq.s4 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 6
+ seq.s6 = seq.s5 + (DATA_TYPE)STEP;
+#if VECTOR_SIZE > 7
+ seq.s7 = seq.s6 + (DATA_TYPE)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+ VSTORE(VECTOR_SIZE)
+ (seq, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif //VECTOR_SIZE == 1
+}
+
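Each work-item of the range kernel above writes VECTOR_SIZE consecutive elements of the sequence START, START + STEP, START + 2*STEP, ...; a host-side C++ reference of the produced values (float data assumed for the sketch):

#include <vector>

// Element i of the output is START + i * STEP, for i in [0, num_elements).
std::vector<float> range_ref(float start, float step, unsigned int num_elements)
{
    std::vector<float> out(num_elements);
    for(unsigned int i = 0; i < num_elements; ++i)
    {
        out[i] = start + static_cast<float>(i) * step;
    }
    return out;
}
// On the device, each work-item writes VECTOR_SIZE consecutive elements starting at
// get_global_id(0) * VECTOR_SIZE.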
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Generates a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=vector_size. e.g. -DVECTOR_SIZE=4
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ *
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8.
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range_quantized(
+ VECTOR_DECLARATION(out))
+{
+ size_t id = get_global_id(0) * VECTOR_SIZE;
+ __global void *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+ float seq;
+ seq = (float)START + (float)id * (float)STEP;
+ seq = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
+ seq = max(0.0f, min(seq, 255.0f));
+ *((__global uchar *)dst_ptr) = CONVERT_SAT(CONVERT_DOWN(seq, int), uchar);
+#else // VECTOR_SIZE == 1
+ VEC_DATA_TYPE(float, VECTOR_SIZE)
+ seq;
+ seq.s0 = (float)START + id * (float)STEP;
+#if VECTOR_SIZE > 1
+ seq.s1 = seq.s0 + (float)STEP;
+#if VECTOR_SIZE > 2
+ seq.s2 = seq.s1 + (float)STEP;
+#if VECTOR_SIZE > 3
+ seq.s3 = seq.s2 + (float)STEP;
+#if VECTOR_SIZE > 4
+ seq.s4 = seq.s3 + (float)STEP;
+#if VECTOR_SIZE > 5
+ seq.s5 = seq.s4 + (float)STEP;
+#if VECTOR_SIZE > 6
+ seq.s6 = seq.s5 + (float)STEP;
+#if VECTOR_SIZE > 7
+ seq.s7 = seq.s6 + (float)STEP;
+#endif // VECTOR_SIZE > 7
+#endif // VECTOR_SIZE > 6
+#endif // VECTOR_SIZE > 5
+#endif // VECTOR_SIZE > 4
+#endif // VECTOR_SIZE > 3
+#endif // VECTOR_SIZE > 2
+#endif // VECTOR_SIZE > 1
+ seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+ seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+ VEC_DATA_TYPE(uchar, VECTOR_SIZE)
+ res = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(uchar, VECTOR_SIZE));
+ VSTORE(VECTOR_SIZE)
+ (res, 0, ((__global DATA_TYPE *)dst_ptr));
+#endif // VECTOR_SIZE == 1
+}
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
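range_quantized applies the usual QASYMM8 mapping to every generated value before storing it; a scalar C++ sketch of that mapping (scale and offset correspond to the -DSCALE_OUT and -DOFFSET_OUT values):

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = clamp(round_to_nearest(value / scale + offset), 0, 255)
uint8_t quantize_qasymm8_ref(float value, float scale, int offset)
{
    float q = value / scale + static_cast<float>(offset);
    q = std::min(std::max(q, 0.0f), 255.0f);
    return static_cast<uint8_t>(std::lrint(q)); // lrint uses the current rounding mode (round-to-nearest by default)
}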
+
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d76e12a..b4ede25 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,11 +61,30 @@
return (in.s0 + in.s1);
}
+/** Calculate product of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return product of vector.
+ */
+inline DATA_TYPE product(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in.s01234567 *= in.s89ABCDEF;
+ in.s0123 *= in.s4567;
+ in.s01 *= in.s23;
+
+ return (in.s0 * in.s1);
+}
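Both sum() and the new product() helper above collapse a 16-wide vector with a pairwise (log2-depth) reduction; the equivalent scalar C++ loop:

// Pairwise reduction of 16 values: 16 -> 8 -> 4 -> 2 -> 1 partial products.
float reduce_product_16(const float v[16])
{
    float tmp[16];
    for(int i = 0; i < 16; ++i)
    {
        tmp[i] = v[i];
    }
    for(int width = 8; width >= 1; width /= 2)
    {
        for(int i = 0; i < width; ++i)
        {
            tmp[i] *= tmp[i + width];
        }
    }
    return tmp[0];
}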
+#if defined(OPERATION)
/** This kernel performs parallel reduction given an operation on x-axis.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
* @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -74,28 +93,28 @@
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums Local buffer for storing the partial sum
+ * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: same as @p src_ptr
+ * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results Local buffer for storing the partial result
*/
__kernel void reduction_operation_x(
IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(partial_sum),
- __local DATA_TYPE *local_sums)
+ IMAGE_DECLARATION(partial_res),
+ __local DATA_TYPE *local_results)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
+ Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
unsigned int lsize = get_local_size(0);
unsigned int lid = get_local_id(0);
for(unsigned int y = 0; y < get_local_size(1); ++y)
{
- local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+ local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
barrier(CLK_LOCAL_MEM_FENCE);
// Perform parallel reduction
@@ -103,7 +122,11 @@
{
if(lid < i)
{
- local_sums[lid] += local_sums[lid + i];
+#if defined(PROD)
+ local_results[lid] *= local_results[lid + i];
+#else //!defined(PROD)
+ local_results[lid] += local_results[lid + i];
+#endif //defined(PROD)
}
barrier(CLK_LOCAL_MEM_FENCE);
}
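The branch above sits inside the standard work-group tree reduction: at every step the first half of the local buffer absorbs the second half, with a local-memory barrier between steps. A serial C++ emulation of that pattern, assuming a power-of-two local size (a sketch of the idea, not the kernel's exact loop):

// On the device every lid in [0, i) runs concurrently and each step is followed by
// barrier(CLK_LOCAL_MEM_FENCE); here the steps are simply executed in order.
void workgroup_reduce_sum(float *local_results, unsigned int lsize)
{
    for(unsigned int i = lsize / 2; i > 0; i /= 2)
    {
        for(unsigned int lid = 0; lid < i; ++lid)
        {
            local_results[lid] += local_results[lid + i];
        }
    }
    // local_results[0] now holds the sum of the original lsize entries.
}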
@@ -113,20 +136,24 @@
#if defined(MEAN) && defined(WIDTH)
if(y == get_local_size(1) - 1)
{
- local_sums[0] /= WIDTH;
+ local_results[0] /= WIDTH;
}
#endif /* defined(MEAN) && defined(WIDTH) */
- ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+ ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
}
}
}
+#endif // defined(OPERATION)
#if defined(WIDTH)
-/** This kernel performs reduction on x-axis. (QASYMM8)
+/** This kernel performs reduction on x-axis. (Non parallel)
*
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -135,33 +162,49 @@
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
*/
-__kernel void reduction_operation_quantized_x(
+__kernel void reduction_operation_non_parallel_x(
VECTOR_DECLARATION(src),
VECTOR_DECLARATION(output))
{
Vector src = CONVERT_TO_VECTOR_STRUCT(src);
Vector output = CONVERT_TO_VECTOR_STRUCT(output);
- uint res = 0;
+ DATA_TYPE_PROMOTED res = *((__global DATA_TYPE *)vector_offset(&src, 0));
- for(unsigned int x = 0; x < WIDTH; ++x)
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ uint indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ for(unsigned int x = 1; x < WIDTH; ++x)
{
- res += *((__global uchar *)vector_offset(&src, x));
+ DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
+#if defined(ARG_MAX)
+ indx = select(indx, x, isgreater(in, res));
+ res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
+#elif defined(ARG_MIN)
+ indx = select(indx, x, isless(in, res));
+ res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+ res += in;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
+ // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ *((__global uint *)output.ptr) = indx;
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(MEAN)
res /= WIDTH;
-#endif /* defined(MEAN) */
-
- // Store result
+#endif // defined(MEAN)
*((__global uchar *)output.ptr) = convert_uchar(res);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(HEIGHT) */
+#endif /* defined(WIDTH) */
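For the ARG_MAX / ARG_MIN paths added above, the select()/isgreater() calls implement a running-best scan that keeps both the best value and its index; a scalar C++ equivalent of the ARG_MAX case (float data assumed):

// Returns the index of the largest element; ties keep the first occurrence,
// matching the strict isgreater() comparison used in the kernel.
unsigned int argmax_ref(const float *src, unsigned int width)
{
    float        best = src[0];
    unsigned int indx = 0;
    for(unsigned int x = 1; x < width; ++x)
    {
        if(src[x] > best)
        {
            best = src[x];
            indx = x;
        }
    }
    return indx;
}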
#if defined(HEIGHT)
/** This kernel performs reduction on y-axis.
*
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
@@ -185,24 +228,49 @@
Image output = CONVERT_TO_IMAGE_STRUCT(output);
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = 0;
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
- for(unsigned int y = 0; y < HEIGHT; ++y)
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ for(unsigned int y = 1; y < HEIGHT; ++y)
{
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(ARG_MAX)
+ uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ indx = select(indx, y, cond_conv);
+ res = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+ uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ indx = select(indx, y, cond_conv);
+ res = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
+ // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(MEAN)
res /= HEIGHT;
-#endif /* defined(MEAN) */
-
- // Store result
+#endif // defined(MEAN)
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
#endif /* defined(HEIGHT) */
@@ -237,24 +305,50 @@
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = 0;
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
- for(unsigned int z = 0; z < DEPTH; ++z)
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ for(unsigned int z = 1; z < DEPTH; ++z)
{
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(ARG_MAX)
+ uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ indx = select(indx, z, cond_conv);
+ res = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+ uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ indx = select(indx, z, cond_conv);
+ res = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
+ // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(MEAN)
res /= DEPTH;
-#endif /* defined(MEAN) */
-
- // Store result
+#endif // defined(MEAN)
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
#endif /* defined(DEPTH) */
@@ -294,23 +388,49 @@
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
- res = 0;
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
- for(unsigned int w = 0; w < BATCH; ++w)
+#if defined(SUM_SQUARE)
+ res *= res;
+#endif // defined(SUM_SQUARE)
+
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ uint16 indx = 0;
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ for(unsigned int w = 1; w < BATCH; ++w)
{
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+
+#if defined(ARG_MAX)
+ uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ indx = select(indx, w, cond_conv);
+ res = select(res, in, isgreater(in, res));
+#elif defined(ARG_MIN)
+ uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ indx = select(indx, w, cond_conv);
+ res = select(res, in, isless(in, res));
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
-#endif // SQRSUM
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
+ // Store result
+#if defined(ARG_MAX) || defined(ARG_MIN)
+ vstore16(indx, 0, (__global uint *)output.ptr);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(MEAN)
res /= BATCH;
-#endif /* defined(MEAN) */
-
- // Store result
+#endif // defined(MEAN)
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
+#endif /* defined(BATCH) && defined(DEPTH) */
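The reduction kernels above are specialised purely through build options (-DPROD, -DARG_MAX/-DARG_MIN, -DMEAN, -DSUM_SQUARE, -DWIDTH, ...). A minimal plain-C sketch of what the non-parallel x-axis kernel computes for float data follows; the helper name reduce_x, the enum and the scalar types are illustrative only, not library API.

    /* Illustrative sketch of reduction_operation_non_parallel_x semantics for float data. */
    enum reduce_op { OP_SUM, OP_MEAN, OP_PROD, OP_ARG_MAX, OP_ARG_MIN };

    static float reduce_x(const float *src, unsigned int width, enum reduce_op op, unsigned int *arg_out)
    {
        float        res  = src[0]; /* accumulator starts from element 0, loop starts at 1 */
        unsigned int indx = 0;
        for(unsigned int x = 1; x < width; ++x)
        {
            const float in = src[x];
            switch(op)
            {
                case OP_PROD:    res *= in; break;
                case OP_ARG_MAX: if(in > res) { res = in; indx = x; } break;
                case OP_ARG_MIN: if(in < res) { res = in; indx = x; } break;
                default:         res += in; break; /* OP_SUM and OP_MEAN accumulate identically */
            }
        }
        if(op == OP_MEAN)
        {
            res /= width; /* MEAN divides by WIDTH after accumulating */
        }
        if(arg_out)
        {
            *arg_out = indx; /* ARG_MAX/ARG_MIN kernels store this index instead of the value */
        }
        return res;
    }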
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
new file mode 100644
index 0000000..691f7ae
--- /dev/null
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+/** Macros that help in loop unrolling */
+//Repeat macros with 3 params, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(1, P_A, P_B, P_C); \
+ REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(2, P_A, P_B, P_C); \
+ REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(3, P_A, P_B, P_C); \
+ REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(4, P_A, P_B, P_C); \
+ REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(5, P_A, P_B, P_C); \
+ REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(6, P_A, P_B, P_C); \
+ REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(7, P_A, P_B, P_C); \
+ REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(8, P_A, P_B, P_C); \
+ REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(9, P_A, P_B, P_C); \
+ REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(A, P_A, P_B, P_C); \
+ REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(B, P_A, P_B, P_C); \
+ REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(C, P_A, P_B, P_C); \
+ REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(D, P_A, P_B, P_C); \
+ REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(E, P_A, P_B, P_C); \
+ REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(F, P_A, P_B, P_C); \
+ REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+//Macro for initializing N variables. Generates N statements that define VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+#endif // ARM_COMPUTE_REPEAT_H
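As a concrete illustration of the unrolling helpers (the names float/acc are arbitrary), REPEAT_VAR_INIT_TO_CONST(3, float, acc, 0.0f) expands as shown below; note that the repetition IDs count down from N-1 and switch to the hexadecimal letters A..F from the 11th repetition onwards.

    /* REPEAT_VAR_INIT_TO_CONST(3, float, acc, 0.0f)
     *   -> REPEAT_3_3(VAR_INIT_TO_CONST, float, acc, 0.0f)
     *   -> VAR_INIT_TO_CONST_DEF(2, ...); VAR_INIT_TO_CONST_DEF(1, ...); VAR_INIT_TO_CONST_DEF(0, ...)
     * which is the three declarations below: */
    float acc2 = 0.0f;
    float acc1 = 0.0f;
    float acc0 = 0.0f;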
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
new file mode 100644
index 0000000..6afd382
--- /dev/null
+++ b/src/core/CL/cl_kernels/reverse.cl
@@ -0,0 +1,102 @@
+/*
+* Copyright (c) 2018 ARM Limited.
+*
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] axis_ptr Pointer to the axis vector. Supported data types: U32
+ * @param[in] axis_stride_x Stride of the axis vector in X dimension (in bytes)
+ * @param[in] axis_step_x axis_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] axis_offset_first_element_in_bytes The offset of the first element in the axis vector
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+ VECTOR_DECLARATION(axis),
+ TENSOR4D_DECLARATION(dst),
+ const uint width,
+ const uint height,
+ const uint depth,
+ const uint batches)
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+ Vector axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
+
+ const uint x_in = get_global_id(0);
+ const uint y_in = get_global_id(1);
+ const uint z_in = get_global_id(2) % depth;
+ const uint w_in = get_global_id(2) / depth;
+
+ const uint4 dims = (uint4)(0, 1, 2, 3);
+ int4 to_reverse = (int4)(0, 0, 0, 0);
+#if NUM_REVERSE_DIMS == 1
+ const uint index = *((__global uint *)axis.ptr);
+ to_reverse = (uint4)index == dims;
+#elif NUM_REVERSE_DIMS == 2
+ const uint2 indices = vload2(0, (__global uint *)axis.ptr);
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+ const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
+ const uint index2 = *((__global uint *)axis.ptr + 2);
+ to_reverse = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
+#else /* NUM_REVERSE_DIMS == 3 */
+ const uint4 indices = vload4(0, (__global uint *)axis.ptr);
+ to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS == 1 */
+ const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+ const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+ const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+ const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
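A small C sketch of the per-element coordinate remapping, assuming for illustration NUM_REVERSE_DIMS=2 with an axis vector containing {0, 2}; the helper name reverse_coords is hypothetical.

    /* Sketch: output coordinates for one element when dimensions 0 (X) and 2 (Z) are reversed. */
    static void reverse_coords(unsigned int x_in, unsigned int y_in, unsigned int z_in, unsigned int w_in,
                               unsigned int width, unsigned int height, unsigned int depth, unsigned int batches,
                               unsigned int out[4])
    {
        const int to_reverse[4] = { 1, 0, 1, 0 }; /* dims 0 and 2 appear in the axis vector */
        out[0] = to_reverse[0] ? width - x_in - 1 : x_in;
        out[1] = to_reverse[1] ? height - y_in - 1 : y_in;
        out[2] = to_reverse[2] ? depth - z_in - 1 : z_in;
        out[3] = to_reverse[3] ? batches - w_in - 1 : w_in;
    }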
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
index f52eb18..430369b 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,11 +75,17 @@
const float w2 = hy * lx;
const float w3 = ly * hx;
const float w4 = ly * lx;
-
- const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
- const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
- const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
- const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#if defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else // !defined(NHWC)
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
@@ -133,9 +139,15 @@
Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
- const int px = get_global_id(0);
- const int py = get_global_id(1);
- const int pw = get_global_id(2);
+#if defined(NHWC)
+ const int px = get_global_id(1);
+ const int py = get_global_id(2);
+ const int pw = get_global_id(0);
+#else // !defined(NHWC)
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+#endif // defined(NHWC)
// Load roi parameters
// roi is laid out as follows { batch_index, x1, y1, x2, y2 }
@@ -161,7 +173,7 @@
const float2 roi_bin_grid = SAMPLING_RATIO;
#else // !defined(SAMPLING_RATIO)
// Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
- const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+ const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
#endif // defined(SAMPLING_RATIO)
// Move input and output pointer across the fourth dimension
@@ -169,15 +181,20 @@
output.ptr += pw * output_stride_w;
for(int pz = 0; pz < MAX_DIM_Z; ++pz)
{
- *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_align_1x1(&input,
- region_start.x,
- bin_size.x,
- roi_bin_grid.x,
- region_end.x,
- region_start.y,
- bin_size.y,
- roi_bin_grid.y,
- region_end.y, pz);
+#if defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else // !defined(NHWC)
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+ *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+ region_start.x,
+ bin_size.x,
+ roi_bin_grid.x,
+ region_end.x,
+ region_start.y,
+ bin_size.y,
+ roi_bin_grid.y,
+ region_end.y, pz);
}
}
#endif // Check for compile time constants
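The NHWC branch only changes the argument order passed to tensor3D_offset(), because in NHWC the channel is the innermost dimension. A sketch of the resulting byte-offset computation, assuming tensor3D_offset(t, a, b, c) advances by a*stride_x + b*stride_y + c*stride_z (tap_offset is a hypothetical helper):

    static size_t tap_offset(int x, int y, int channel, int nhwc,
                             size_t stride_x, size_t stride_y, size_t stride_z)
    {
        if(nhwc)
        {
            /* NHWC: channel is innermost, so it is passed as the X coordinate */
            return (size_t)channel * stride_x + (size_t)x * stride_y + (size_t)y * stride_z;
        }
        /* NCHW: x is innermost, the channel plane is outermost */
        return (size_t)x * stride_x + (size_t)y * stride_y + (size_t)channel * stride_z;
    }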
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index 042b102..0cf296c 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,10 +105,12 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source image as specified by ROI
- * @param[in] rois_ptr Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
- * @param[in] rois_stride_x Stride of the rois array in X dimension (in bytes)
- * @param[in] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the rois array
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
* @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -122,13 +124,13 @@
*/
__kernel void roi_pooling_layer(
TENSOR3D_DECLARATION(input),
- VECTOR_DECLARATION(rois),
+ IMAGE_DECLARATION(rois),
TENSOR3D_DECLARATION(output),
unsigned int input_stride_w, unsigned int output_stride_w)
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- Vector rois = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+ Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
const int px = get_global_id(0);
@@ -136,12 +138,12 @@
const int pw = get_global_id(2);
// Load roi parameters
- // roi is laid out as follows:
- // { x, y, width, height, batch_index }
- const ushort4 roi = vload4(0, (__global ushort *)vector_offset(&rois, pw));
- const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
- const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
- const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
+ // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+ const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+ const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+ const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
// Calculate pooled region start and end
const float2 spatial_indx = (float2)(px, py);
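With the ROI layout now { batch_index, x1, y1, x2, y2 }, the pooled region is derived from corner coordinates rather than from a stored width/height. A sketch with hypothetical values, assuming SPATIAL_SCALE = 0.5f and an ROI of {0, 8, 4, 24, 12} (the helper name decode_roi is illustrative):

    #include <math.h>

    /* roi[] = { batch_index, x1, y1, x2, y2 } */
    static void decode_roi(const float roi[5], float spatial_scale, int anchor[2], int dims[2])
    {
        anchor[0] = (int)roundf(roi[1] * spatial_scale);                        /* 8  * 0.5 -> 4 */
        anchor[1] = (int)roundf(roi[2] * spatial_scale);                        /* 4  * 0.5 -> 2 */
        dims[0]   = (int)fmaxf(roundf((roi[3] - roi[1]) * spatial_scale), 1.f); /* (24-8)*0.5 -> 8 */
        dims[1]   = (int)fmaxf(roundf((roi[4] - roi[2]) * spatial_scale), 1.f); /* (12-4)*0.5 -> 4 */
    }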
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 744f28a..5ac6443 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -134,9 +134,11 @@
vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
}
+#if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
*
 * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -160,28 +162,29 @@
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
const float new_x = (get_global_id(1) + 0.5f) * scale_x;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y)));
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
}
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
*
* @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -205,22 +208,22 @@
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
const float new_x = get_global_id(1) * scale_x;
- const float new_y = get_global_id(2) * scale_y;
+ const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
@@ -241,10 +244,10 @@
clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
#endif /* BORDER_MODE_REPLICATE */
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+ float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
const float a = new_x - new_xf;
const float b = 1.f - a;
@@ -254,3 +257,4 @@
*((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
}
+#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
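Once the NHWC scale kernels take 4D tensors, the third global-ID dimension packs both the output row and the batch, which is why -DDEPTH_OUT is required to unpack it. A sketch of that decomposition (unpack_gid2 is a hypothetical helper):

    static void unpack_gid2(unsigned int gid2, unsigned int depth_out,
                            unsigned int *row, unsigned int *batch)
    {
        *row   = gid2 % depth_out; /* Y coordinate used to compute new_y                          */
        *batch = gid2 / depth_out; /* batch index passed as the 4th tensor4D_offset() coordinate  */
    }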
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
index 3211e7e..86dbf60 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -85,12 +85,14 @@
vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
}
+#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
*
* @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
 * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
 * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
 * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -114,22 +116,22 @@
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear_quantized_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
const float new_x = get_global_id(1) * scale_x;
- const float new_y = get_global_id(2) * scale_y;
+ const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
@@ -150,10 +152,10 @@
clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
#endif /* BORDER_MODE_REPLICATE */
- int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+ int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
const float a = new_x - new_xf;
const float b = 1.f - a;
@@ -167,3 +169,4 @@
*((__global DATA_TYPE *)out.ptr) = res;
}
+#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
new file mode 100644
index 0000000..d783ae2
--- /dev/null
+++ b/src/core/CL/cl_kernels/select.cl
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE)
+/** This function performs a select operation between two tensors when the condition tensor has the same rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
+ * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] c_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] c_step_y c_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] c_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] c_step_z c_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] x_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] y_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_same_rank(
+ TENSOR3D_DECLARATION(c),
+ TENSOR3D_DECLARATION(x),
+ TENSOR3D_DECLARATION(y),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D c_t = CONVERT_TO_TENSOR3D_STRUCT(c);
+ Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
+ Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
+ Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+ in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+ // Calculate and store result
+ VSTORE(VEC_SIZE)
+ (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+
+/** This function performs a select operation between two tensors when the condition tensor has a different rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
+ * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] x_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] y_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_2(
+ VECTOR_DECLARATION(c),
+ TENSOR3D_DECLARATION(x),
+ TENSOR3D_DECLARATION(y),
+ TENSOR3D_DECLARATION(out))
+{
+ const int c_idx = get_global_id(1);
+
+ // Get pixels pointer
+ Vector c_t = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+ Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
+ Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
+ Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+ in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+ // Calculate and store result
+ VSTORE(VEC_SIZE)
+ (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) */
+
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE)
+/** This function performs a select operation between two tensors when the condition tensor has a different rank.
+ *
+ * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention The select operation data_type needs to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8
+ * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] x_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] y_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_n(
+ VECTOR_DECLARATION(c),
+ TENSOR3D_DECLARATION(x),
+ TENSOR3D_DECLARATION(y),
+ TENSOR3D_DECLARATION(out))
+{
+ const int c_idx = get_global_id(2) / DEPTH_SIZE;
+
+ // Get pixels pointer
+ Vector c_t = CONVERT_TO_VECTOR_STRUCT_NO_STEP(c);
+ Tensor3D x_t = CONVERT_TO_TENSOR3D_STRUCT(x);
+ Tensor3D y_t = CONVERT_TO_TENSOR3D_STRUCT(y);
+ Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+ in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_t.ptr);
+
+ // Calculate and store result
+ VSTORE(VEC_SIZE)
+ (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr);
+}
+#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */
\ No newline at end of file
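In select_different_rank_n the condition tensor holds a single value per outer index, so each work-item derives which condition element to read from its batch, i.e. get_global_id(2) / DEPTH_SIZE. A sketch of that lookup, assuming DEPTH_SIZE is the depth of the value tensors (cond_for_workitem is a hypothetical helper):

    static unsigned char cond_for_workitem(const unsigned char *c, unsigned int gid2, unsigned int depth_size)
    {
        const unsigned int c_idx = gid2 / depth_size; /* outer index of this work-item      */
        return c[c_idx];                              /* non-zero selects x, zero selects y */
    }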
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
index bc3df47..97decee 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -64,7 +64,9 @@
int offset = 0;
// Offset X
-#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#if defined(SHRINK_0)
+ input.ptr += (int)START_0 * input_stride_x;
+#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
// Check if access on width gets out of bounds
// If it does shift access vector to access elements within bounds
const int xi = (int)(get_global_id(0) * VEC_SIZE);
@@ -77,20 +79,46 @@
#endif // defined(START_0) && defined(STRIDE_0)
// Offset Y
-#if defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_1)
+ input.ptr += (int)START_1 * input_stride_y;
+#elif defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_0)
+ offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
+#else // defined(SHRINK_0)
offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+#endif // defined(SHRINK_0)
input.ptr += offset * input_stride_y;
#endif // defined(START_1) && defined(STRIDE_1)
// Offset Z
-#if defined(START_2) && defined(STRIDE_2)
+#if defined(SHRINK_2)
+ input.ptr += (int)START_2 * input_stride_z;
+#elif defined(START_2) && defined(STRIDE_2)
+
+#if defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
+#elif defined(SHRINK_1) || defined(SHRINK_0)
+ offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
+#else // defined(SHRINK_1) && defined(SHRINK_0)
offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+#endif // defined(SHRINK_1) && defined(SHRINK_0)
+
input.ptr += offset * input_stride_z;
#endif // defined(START_2) && defined(STRIDE_2)
// Offset depth
-#if defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_3)
+ input.ptr += (int)START_3 * input_stride_w;
+#elif defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
+#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
+ offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
+#else // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+ offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
+#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
input.ptr += offset * input_stride_w;
#endif // defined(START_3) && defined(STRIDE_3)
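The SHRINK_n handling re-packs the global-ID space: a shrunk axis contributes only its START offset, and every following non-shrunk axis moves down one global-ID slot (with slot 2 still split by DST_DEPTH when two axes share it). A sketch of that slot assignment under the stated assumption (gid_slot_for_axis is a hypothetical helper):

    static int gid_slot_for_axis(int axis, const int shrink[4])
    {
        int slot = axis;
        for(int i = 0; i < axis; ++i)
        {
            if(shrink[i])
            {
                --slot; /* each earlier shrunk axis frees one global-ID dimension */
            }
        }
        return slot < 2 ? slot : 2; /* slots >= 2 map onto gid(2), split by DST_DEPTH */
    }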
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
index d42a79d..79343d4 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -23,7 +23,7 @@
*/
#include "helpers.h"
-#if defined(BATCH_SIZE) && defined(DATA_TYPE)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -83,12 +83,15 @@
const int out_y = get_global_id(1);
const int z = get_global_id(2);
- if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
{
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int w = batch_id % r;
- const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
- const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
}
}
@@ -151,18 +154,21 @@
const int out_y = get_global_id(2);
const int z = get_global_id(0);
- if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
{
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int w = batch_id % r;
- const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
- const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
}
}
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
-#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
/** Calculate the space to batch conversion.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -207,12 +213,15 @@
const int out_y = get_global_id(1);
const int z = get_global_id(2);
- if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
{
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int w = batch_id % r;
- const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
- const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
}
}
@@ -260,13 +269,16 @@
const int out_y = get_global_id(2);
const int z = get_global_id(0);
- if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
{
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int w = batch_id % r;
- const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
- const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
}
}
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
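The reworked indexing first reconstructs the padded-input position of each output element and then tests it against the crop window, instead of testing the output coordinates directly. A sketch of the mapping, mirroring the kernel arithmetic (map_space_to_batch is a hypothetical helper):

    static int map_space_to_batch(int out_x, int out_y, int batch_id,
                                  int block_x, int block_y, int batch_in,
                                  int pad_left_x, int pad_left_y,
                                  int width_in, int height_in,
                                  int *in_x, int *in_y, int *in_batch)
    {
        const int pos_x = out_x * block_x + ((batch_id / batch_in) % block_x);
        const int pos_y = out_y * block_y + ((batch_id / batch_in) / block_x);
        if(pos_x < pad_left_x || pos_x >= pad_left_x + width_in ||
           pos_y < pad_left_y || pos_y >= pad_left_y + height_in)
        {
            return 0; /* falls into the padding region: nothing to copy */
        }
        *in_x     = pos_x - pad_left_x;
        *in_y     = pos_y - pad_left_y;
        *in_batch = batch_id % batch_in;
        return 1;
    }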
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl
new file mode 100644
index 0000000..bed6266
--- /dev/null
+++ b/src/core/CL/cl_kernels/stack_layer.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+#if AXIS == 0
+#define X_DST (idx_input)
+#define Y_DST (x_src)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 1 // AXIS == 1
+#define X_DST (x_src)
+#define Y_DST (idx_input)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 2 // AXIS == 2
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (idx_input)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 3 // AXIS == 3
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (idx_input)
+#define K_DST (w_src)
+#elif AXIS == 4 // AXIS == 4
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (w_src)
+#define K_DST (idx_input)
+#else // AXIS not supported
+#error "Not supported axis"
+#endif // AXIS == 0
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
+ * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
+ * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_input Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+ unsigned int idx_input)
+{
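+ // get_global_id(2) iterates over the collapsed Z and W dimensions of the source tensor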
+ uint x_src = get_global_id(0);
+ uint y_src = get_global_id(1);
+ uint z_src = (get_global_id(2) % SRC_DIM2);
+ uint w_src = (get_global_id(2) / SRC_DIM2);
+
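+ // Byte address of element (x_src, y_src, z_src, w_src) in the source tensor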
+ __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
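+ // The destination coordinates are the source coordinates plus the input index, remapped through
+ // X_DST..K_DST according to AXIS; the fifth dimension has no stride of its own, so it is
+ // addressed with a stride of dst_stride_w * DST_DIM3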
+ __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+ dst_stride_w * (uint)DST_DIM3);
+
+ *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#undef K_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/tile.cl
new file mode 100644
index 0000000..ae625d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/tile.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Perform a tile operation, i.e. copy the source tensor repeatedly along each dimension until the destination tensor is filled.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention The source tensor dimensions and the destination depth must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT, -DSRC_DEPTH and -DDST_DEPTH
+ * @attention A vector size and a backtrack offset can optionally be passed as preprocessor arguments using -DVEC_SIZE=size and -DOFFSET, e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void tile(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+
+ // For all coordinates but x, each tile copies from the input
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % DST_DEPTH;
+ const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+ // If we are loading/storing multiple elements at a time, we must not
+ // exceed the input boundaries. The last threads need to backtrack by
+ // OFFSET elements, and these offsets accumulate across the previous tiles
+ const int id = (int)(get_global_id(0));
+ int x = id * VEC_SIZE;
+
+ // Shift x based on the previous offsets
+ const int tile_number = x / SRC_WIDTH;
+ x -= (tile_number) * OFFSET;
+ int x_input = x % SRC_WIDTH;
+
+ // Shift x based on being the last tile
+ const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
+ x -= last_tile * OFFSET;
+ x_input = x % SRC_WIDTH;
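+ // Move the output pointer back by the same cumulative backtrack so the vector store lands at the adjusted x position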
+ output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+
+ // Update the input pointer
+ input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+ // Copy the data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+#else // !defined(VEC_SIZE) || !defined(OFFSET)
+ const int x = get_global_id(0);
+
+ // Update the input pointer
+ input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+ *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index f52b027..e979978 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -23,7 +23,15 @@
*/
#include "helpers.h"
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -32,6 +40,10 @@
* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note The A and B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE e.g. -DSELECT_DATA_TYPE=int
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -86,6 +98,7 @@
float out00 = d00 + d01 + d02;
float out01 = d01 - d02 - d03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
@@ -150,10 +163,12 @@
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -162,11 +177,12 @@
out10 += (DATA_TYPE)b;
out11 += (DATA_TYPE)b;
#endif // defined(HAS_BIAS)
-
- vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))((DATA_TYPE)out10, (DATA_TYPE)out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(VEC_SIZE) && VEC_SIZE == 4
/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -230,6 +246,7 @@
float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
@@ -351,12 +368,14 @@
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = (DATA_TYPE)out02;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = (DATA_TYPE)out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out00, (DATA_TYPE)out01, (DATA_TYPE)out02, (DATA_TYPE)out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -377,9 +396,9 @@
out32 += (float)b;
out33 += (float)b;
#endif // defined(HAS_BIAS)
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out10, (DATA_TYPE)out11, (DATA_TYPE)out12, (DATA_TYPE)out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out20, (DATA_TYPE)out21, (DATA_TYPE)out22, (DATA_TYPE)out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out30, (DATA_TYPE)out31, (DATA_TYPE)out32, (DATA_TYPE)out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
@@ -579,25 +598,29 @@
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if defined(SRC_DEPTH)
int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
-#else /* defined(SRC_DEPTH) */
+#else /* defined(SRC_DEPTH) */
int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
-#endif /* defined(SRC_DEPTH) */
+#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
// Store the 1x4 output tile
- *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = (DATA_TYPE)out00;
- *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = (DATA_TYPE)out01;
- *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = (DATA_TYPE)out02;
- *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = (DATA_TYPE)out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3;
#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
// Store the 4x1 output tile
int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
int mult_y = min(dst_size - offset, 1);
- *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = (DATA_TYPE)out00;
- *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = (DATA_TYPE)out01;
- *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = (DATA_TYPE)out02;
- *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = (DATA_TYPE)out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
// Get output address
#if defined(SRC_DEPTH)
@@ -609,22 +632,30 @@
int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
// Store the 4x4 output tile
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = (DATA_TYPE)out00;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = (DATA_TYPE)out01;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = (DATA_TYPE)out02;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = (DATA_TYPE)out03;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = (DATA_TYPE)out10;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = (DATA_TYPE)out11;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = (DATA_TYPE)out12;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = (DATA_TYPE)out13;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = (DATA_TYPE)out20;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = (DATA_TYPE)out21;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = (DATA_TYPE)out22;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = (DATA_TYPE)out23;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = (DATA_TYPE)out30;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = (DATA_TYPE)out31;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = (DATA_TYPE)out32;
- *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = (DATA_TYPE)out33;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
}
@@ -690,6 +721,7 @@
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else /* defined(SRC_DEPTH) */
+
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */
@@ -706,6 +738,7 @@
#if defined(SRC_DEPTH)
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
#else /* defined(SRC_DEPTH) */
+
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
#endif /* defined(SRC_DEPTH) */
@@ -740,15 +773,18 @@
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
- *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
- *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
- *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
+ vstore4(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4))), 0, (__global DATA_TYPE *)(dst_addr));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
@@ -859,10 +895,10 @@
#endif // defined(HAS_BIAS)
// Store the output tile
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s0, (DATA_TYPE)out_col1.s0, (DATA_TYPE)out_col2.s0, (DATA_TYPE)out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s1, (DATA_TYPE)out_col1.s1, (DATA_TYPE)out_col2.s1, (DATA_TYPE)out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s2, (DATA_TYPE)out_col1.s2, (DATA_TYPE)out_col2.s2, (DATA_TYPE)out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
- vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s3, (DATA_TYPE)out_col1.s3, (DATA_TYPE)out_col2.s3, (DATA_TYPE)out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0)), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1)), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2)), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC((VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3)), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
@@ -960,18 +996,21 @@
#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
- *(__global DATA_TYPE *)(dst_ptr + offset.s0) = (DATA_TYPE)out00;
- *(__global DATA_TYPE *)(dst_ptr + offset.s1) = (DATA_TYPE)out01;
- *(__global DATA_TYPE *)(dst_ptr + offset.s2) = (DATA_TYPE)out02;
- *(__global DATA_TYPE *)(dst_ptr + offset.s3) = (DATA_TYPE)out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Get output address
int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-
- *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = (DATA_TYPE)out00;
- *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = (DATA_TYPE)out01;
- *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = (DATA_TYPE)out02;
- *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = (DATA_TYPE)out03;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)));
+ *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -1094,26 +1133,37 @@
int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
// Store the output tile
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col0.s0;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col1.s0;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col2.s0;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col3.s0;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col0.s1;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col1.s1;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col2.s1;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col3.s1;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col0.s2;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col1.s2;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col2.s2;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col3.s2;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col0.s3;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col1.s3;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col2.s3;
- *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col3.s3;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col2_dt = ACTIVATION_FUNC(CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col3_dt = ACTIVATION_FUNC(CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1181,7 +1231,9 @@
#endif // defined(HAS_BIAS)
);
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(VEC_SIZE) && VEC_SIZE == 4
/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1449,9 +1501,11 @@
#endif // defined(HAS_BIAS)
dst_size);
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1519,7 +1573,9 @@
#endif // defined(HAS_BIAS)
);
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+#if defined(VEC_SIZE) && VEC_SIZE == 4
/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
*
* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -1787,5 +1843,6 @@
#endif // defined(HAS_BIAS)
dst_size);
}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 73a4d7d..100184d 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -129,24 +129,25 @@
b_const_int = input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
}
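+ // The quantized logistic activation uses a dedicated kernel (selected below), so the generic
+ // -DACT build option is only added for the remaining cases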
+ const bool is_logistic_activation_quantized = is_data_type_quantized_asymmetric(dt) && act_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC;
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
- build_opts.emplace(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
- build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(!is_logistic_activation_quantized, "-DACT=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.add_option(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
+ build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
if(is_data_type_quantized(dt))
{
- build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
- build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+ build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+ build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
const int o1 = input->info()->quantization_info().offset;
const float s1 = input->info()->quantization_info().scale;
// Quantized value of 0 corresponds to the offset o1
- build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
- build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
- build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+ build_opts.add_option(("-DCONST_0=" + support::cpp11::to_string(o1)));
+ build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+ build_opts.add_option(("-DO1_VAL=" + support::cpp11::to_string(o1)));
// Set scale and offset of the input and output if they have different quantization info
if(is_data_type_quantized_asymmetric(dt) && output != nullptr)
@@ -156,22 +157,26 @@
if(o1 != o2 || s1 != s2)
{
- build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
- build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+ build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+ build_opts.add_option(("-DO2_VAL=" + support::cpp11::to_string(o2)));
}
}
}
else
{
- build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
- build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+ build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+ build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
}
- build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
// Create kernel
- std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
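+ // Quantized logistic has a dedicated kernel; other quantized activations share the generic _qa8 variant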
+ std::string kernel_name = std::string("activation_layer");
+ if(is_data_type_quantized_asymmetric(dt))
+ {
+ kernel_name += is_logistic_activation_quantized ? std::string("_logistic_qa8") : std::string("_qa8");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Make sure _kernel is initialized before calling the parent's configure
_input = input;
diff --git a/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp b/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp
deleted file mode 100644
index e995ba1..0000000
--- a/src/core/CL/kernels/CLArithmeticDivisionKernel.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output, out_shape);
-
- if(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output, Format::F16);
- }
- else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output, Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticDivisionKernel::CLArithmeticDivisionKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_div", build_opts));
-
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void CLArithmeticDivisionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticDivisionKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
deleted file mode 100644
index 95d2011..0000000
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include <set>
-#include <string>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
- const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
- }
-
- const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if(output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
- }
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
-{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(output, out_shape);
-
- if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
- {
- set_format_if_unknown(output, Format::S16);
- }
- else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
- {
- set_format_if_unknown(output, Format::F16);
- }
- else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
- {
- set_format_if_unknown(output, Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- bool has_float_out = is_data_type_float(output->info()->data_type());
-
- // Setup kernel
- std::string kernel_name = "arithmetic_sub";
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option_if_else(policy == ConvertPolicy::WRAP || has_float_out, "-DWRAP", "-DSATURATE");
- build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
- {
- build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
- build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
- build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
- build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
- build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
- build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
- kernel_name += "_quantized";
- }
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Configure kernel window
- ICLKernel::configure_internal(win_config.second);
-}
-
-Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
-
- return Status{};
-}
-
-void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input1->info()->tensor_shape();
- const TensorShape &in_shape2 = _input2->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- // Collapse only if broadcast dimensions is less than 2, or in case of no broadcasting
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
-
- const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input1, slice_input1);
- add_3D_tensor_argument(idx, _input2, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- }
- while(collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLArithmeticSubtractionKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index bff28e3..7c30a94 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,6 +42,7 @@
Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[1] != boxes->tensor_shape()[1]);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 53a5456..f232f6c 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,6 +56,7 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
similarity index 67%
rename from src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
rename to src/core/CL/kernels/CLComparisonKernel.cpp
index 10d7fd4..f5f5a0f 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,49 +21,55 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-using namespace arm_compute;
+#include <map>
+namespace arm_compute
+{
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 8;
-
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
+// Create supported comparisons map
+const std::map<ComparisonOperation, std::string> supported_comparison_ops =
{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ { ComparisonOperation::Equal, "EQUAL" },
+ { ComparisonOperation::NotEqual, "NOTEQUAL" },
+ { ComparisonOperation::Greater, "GREATER" },
+ { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
+ { ComparisonOperation::Less, "LESS" },
+ { ComparisonOperation::LessEqual, "LESSEQUAL" },
+};
- const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
- }
+int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
+{
+ return 16 / input.element_size();
+}
+
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1,
+ 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+ ARM_COMPUTE_RETURN_ERROR_ON(supported_comparison_ops.count(operation) == 0);
const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
if(output.total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
- "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
"Wrong shape for output");
- if(is_qasymm)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
- }
}
return Status{};
@@ -75,23 +81,10 @@
const TensorShape &out_shape = broadcast_pair.first;
const ValidRegion &valid_region = broadcast_pair.second;
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(output, out_shape);
+ const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
- if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
- {
- set_format_if_unknown(output, Format::S16);
- }
- else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
- {
- set_format_if_unknown(output, Format::F16);
- }
- else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
- {
- set_format_if_unknown(output, Format::F32);
- }
- }
+ // Auto initialize output if not initialized
+ auto_init_if_empty(output, out_shape, 1, DataType::U8, QuantizationInfo());
Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
Window win_input1 = win.broadcast_if_dimension_le_one(input1);
@@ -112,15 +105,15 @@
}
} // namespace
-CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+CLComparisonKernel::CLComparisonKernel()
: _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
-void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
// Configure kernel window
auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
@@ -130,25 +123,21 @@
_input2 = input2;
_output = output;
- const bool has_float_out = is_data_type_float(output->info()->data_type());
-
- std::string kernel_name = "arithmetic_add";
+ const std::string &operation_name = supported_comparison_ops.at(operation);
+ std::string kernel_name = "compare_" + lower_string(operation_name);
// Set kernel build options
std::set<std::string> build_opts;
- build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
- build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DOP=" + operation_name);
+ build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
{
build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
build_opts.emplace("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
- build_opts.emplace("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
- build_opts.emplace("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
- build_opts.emplace("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+ build_opts.emplace("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
kernel_name += "_quantized";
}
@@ -165,21 +154,20 @@
_config_id += support::cpp11::to_string(output->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += (policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
-void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -225,9 +213,12 @@
while(collapsed.slide_window_slice_3D(slice));
}
-BorderSize CLArithmeticAdditionKernel::border_size() const
+BorderSize CLComparisonKernel::border_size() const
{
+ const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
+
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
return BorderSize(0, border, 0, 0);
}
+} // namespace arm_compute
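
The renamed kernel builds its name and the OP / OP_NAME defines from supported_comparison_ops, and sizes VEC_SIZE so that each work-item handles 16 bytes regardless of the input data type. A small sketch of that configure-time string handling, with std::transform standing in for the library's lower_string helper:

// Editorial sketch (not part of the patch): deriving the comparison kernel name and
// vector size. 16 bytes per iteration means 16 chars, 8 shorts or 4 floats.
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <string>

int main()
{
    const std::string op_name      = "GREATEREQUAL";    // value from supported_comparison_ops
    const std::size_t element_size = 4;                  // e.g. F32
    const std::size_t vec_size     = 16 / element_size;  // -> 4

    std::string lowered = op_name;
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
                   [](unsigned char c) { return std::tolower(c); });

    const std::string kernel_name = "compare_" + lowered; // -> "compare_greaterequal"
    std::printf("%s -DVEC_SIZE=%zu -DOP=%s -DOP_NAME=%s\n",
                kernel_name.c_str(), vec_size, op_name.c_str(), lowered.c_str());
    return 0;
}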
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index dd7d790..70337be 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -42,9 +43,10 @@
const PadStrideInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
const DataLayout data_layout = input->data_layout();
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 4002394..3fccc04 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -113,6 +113,13 @@
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options()));
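
The new defines are only emitted when the input and output quantization infos differ, i.e. when depth concatenation has to requantize on the fly. The implied per-element transform is the usual affine requantization; a host-side sketch for a single QASYMM8 value (illustrative only, not the OpenCL source):

// Editorial sketch (not part of the patch): the affine requantization implied by the
// OFFSET_IN1/OFFSET_OUT/SCALE_IN1/SCALE_OUT defines.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

uint8_t requantize(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
{
    const float real = scale_in * (static_cast<int>(q_in) - offset_in);                  // dequantize
    const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out;     // requantize
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));                          // saturate to U8
}

int main()
{
    // 0.5 * (128 - 0) = 64.0 -> 64 / 1.0 + 10 = 74
    std::printf("%d\n", requantize(128, 0.5f, 0, 1.0f, 10));
    return 0;
}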
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index ffbd295..e188ee9 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,8 +37,8 @@
#include <set>
#include <string>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
@@ -46,42 +46,20 @@
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input == output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
+ 1,
+ DataType::U8, DataType::S8, DataType::S16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
+ 1,
+ DataType::U8, DataType::S8, DataType::S16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), "Input and output data types must be different");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()) && shift != 0, "Shift is used only with integer inputs");
ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
- // Check if convertion is supported
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::U16 && output->data_type() != DataType::S16
- && output->data_type() != DataType::U32 && output->data_type() != DataType::S32),
- "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32
- && output->data_type() != DataType::S32),
- "Only data types supported [in] U16 -> [out] U8, U32, S32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32
- && output->data_type() != DataType::S32),
- "Only data types supported [in] S16 -> [out] U8, U32, S32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U32 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U16
- && output->data_type() != DataType::S16),
- "Only data types supported [in] U32 -> [out] U8, U16, S16");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U16
- && output->data_type() != DataType::S16),
- "Only data types supported [in] S32 -> [out] U8, U16, S16");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
- "Only data types supported [in] F16 -> [out] F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
- "Only data types supported [in] F32 -> [out] F16");
-
// Validate in case of configured output
if(output->total_size() > 0)
{
@@ -105,25 +83,33 @@
const size_t input_size = data_size_from_type(input->info()->data_type());
const size_t output_size = data_size_from_type(output->info()->data_type());
+ // Get number of elements to process per iteration
+ const unsigned int num_elems_processed_per_iteration = 16;
+
// Set build options
CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- // Down conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
- build_opts.add_option_if(input_size > output_size, ((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
- build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
+ // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+ build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+ build_opts.add_option_if(is_data_type_float(input->info()->data_type()) || is_data_type_float(output->info()->data_type()), "-DIS_DATA_TYPE_FLOAT");
// Create kernel
- const std::string kernel_name = (input_size > output_size) ? "convert_depth_down" : "convert_depth_up";
+ const std::string kernel_name = (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set shift arg
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
_kernel.setArg(idx++, shift);
// Configure kernel
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+ ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ // Collapse window
+ const Window &full_window = window();
+ Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+ ICLKernel::configure_internal(collapsed_window);
}
Status CLDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
@@ -132,3 +118,4 @@
return Status{};
}
+} // namespace arm_compute
\ No newline at end of file
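
The rewritten validation drops the old per-pair whitelist: any of the listed types may now convert to any other, shift is rejected for float inputs, and conversions from float always saturate because out-of-range float-to-integer conversion is implementation defined in OpenCL C. Kernel selection now depends only on element sizes. A sketch of that selection, assuming only the sizes matter:

// Editorial sketch (not part of the patch): kernel selection in the reworked
// CLDepthConvertLayerKernel. Same-size and narrowing conversions take the "down"
// variant; only widening conversions take the "up" variant.
#include <cstddef>
#include <cstdio>
#include <string>

std::string select_depth_convert_kernel(std::size_t input_size, std::size_t output_size)
{
    return (input_size >= output_size) ? "convert_depth_down" : "convert_depth_up";
}

int main()
{
    std::printf("%s\n", select_depth_convert_kernel(4, 2).c_str()); // S32 -> S16: convert_depth_down
    std::printf("%s\n", select_depth_convert_kernel(1, 4).c_str()); // U8  -> F32: convert_depth_up
    return 0;
}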
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 1fce14f..5e5a35c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,9 +37,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
@@ -54,9 +53,24 @@
"For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+ const size_t weights_width = 3;
+ const size_t weights_height = 3;
+
+ if(is_qasymm)
+ {
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height));
+ }
if(biases != nullptr)
{
@@ -66,15 +80,16 @@
}
else
{
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
}
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
+
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
if(output->total_size() != 0)
{
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
@@ -82,10 +97,13 @@
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &conv_info)
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
+ const size_t weights_width = 3;
+ const size_t weights_height = 3;
+
// Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, 1 /* depth_multiplier */);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
 // Output auto initialization if not yet initialized
auto_init_if_empty(*output,
@@ -110,10 +128,19 @@
AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
- AccessWindowHorizontal weights_access(weights, 0, num_elems_accessed_per_iteration);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+ bool window_changed = false;
+
+ if(is_qasymm)
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+ else
+ {
+ AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+ }
if(bias != nullptr)
{
@@ -142,22 +169,9 @@
unsigned int depth_multiplier, ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- // Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(),
- output_shape,
- 1,
- input->info()->data_type(),
- input->info()->quantization_info());
-
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
-
- const unsigned int conv_stride_x = conv_info.stride().first;
- ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
- ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
@@ -243,7 +257,7 @@
}
else
{
- build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+ build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
}
build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
@@ -252,13 +266,8 @@
// Create kernel
std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
&& is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
-
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set config_id for enabling LWS tuning
_config_id = kernel_name;
@@ -283,7 +292,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), conv_info)
+ output->clone().get(), conv_info, depth_multiplier)
.first);
return Status{};
@@ -297,6 +306,7 @@
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
+ const bool is_qasymm = is_data_type_quantized_asymmetric(_input->info()->data_type());
Window win = window_collapsed;
win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
@@ -311,7 +321,7 @@
Window slice_in = win_in.first_slice_window_4D();
Window slice_out = win.first_slice_window_4D();
- unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
+ unsigned int idx = 2 * num_arguments_per_4D_tensor() + (is_qasymm ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
if(_biases != nullptr)
{
@@ -330,9 +340,16 @@
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, slice_in);
add_4D_tensor_argument(idx, _output, slice_out);
- add_3D_tensor_argument(idx, _weights, slice_out);
-
+ if(is_qasymm)
+ {
+ add_2D_tensor_argument(idx, _weights, slice_out);
+ }
+ else
+ {
+ add_3D_tensor_argument(idx, _weights, slice_out);
+ }
enqueue(queue, *this, slice_out, lws_hint());
}
while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
}
+} // namespace arm_compute
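
For the QASYMM8 path the weights now arrive pre-reshaped (by the new reshape-weights kernel added below), so validation checks a flattened layout with c0 = 4 channels per block instead of a 3x3 plane, and run() binds the weights as a 2D tensor. A sketch of the dimension check, with weights_width = weights_height = 3 as in validate_arguments():

// Editorial sketch (not part of the patch): the QASYMM8 weights-shape check. A reshaped
// 3x3 depthwise weight tensor has dimension(0) == c0 * 9, with c0 fixed to 4.
#include <cassert>
#include <cstddef>

bool qasymm_weights_shape_ok(std::size_t dim0, std::size_t c0 = 4, std::size_t kernel_w = 3, std::size_t kernel_h = 3)
{
    return (dim0 / c0) == kernel_w * kernel_h;
}

int main()
{
    assert(qasymm_weights_shape_ok(36));   // 4 * 9 -> valid
    assert(!qasymm_weights_shape_ok(40));  // not a 4-channel block of a 3x3 kernel
    return 0;
}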
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
similarity index 85%
rename from src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
rename to src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
index 683dda8..b73ccf5 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -49,6 +49,7 @@
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(idx_w) * input->dimension(idx_h) + ((biases != nullptr) ? 1 : 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
if(biases != nullptr)
{
@@ -61,12 +62,12 @@
}
} // namespace
-CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
+CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel()
: _input(nullptr), _biases(nullptr), _output(nullptr)
{
}
-void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
+void CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
@@ -88,23 +89,23 @@
build_opts.emplace("-DHAS_BIAS");
}
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights_generic", build_opts));
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
- // The CLDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ // The CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel doesn't need padding so update_window_and_padding() can be skipped
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
}
-Status CLDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+Status CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases));
return Status{};
}
-void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
new file mode 100644
index 0000000..6b6438a
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != 3);
+
+ if(output->total_size() != 0)
+ {
+ auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), reshaped_weights_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+ auto reshaped_input_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*input, info);
+ auto_init_if_empty(*output, reshaped_input_shape, 1, input->data_type(), input->quantization_info());
+
+ Window win = calculate_max_window(*input, Steps(info.c0));
+ AccessWindowHorizontal weights_access(input, 0, info.c0);
+ const bool window_changed = update_window_and_padding(win, weights_access);
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLDepthwiseConvolutionLayerReshapeWeightsKernel::CLDepthwiseConvolutionLayerReshapeWeightsKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
+ auto win_config = validate_and_configure_window(input->info(), output->info(), info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ ICLKernel::configure_internal(win_config.second);
+
+ _input = input;
+ _output = output;
+
+ // Build the kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0));
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0)));
+ build_opts.add_option_if(info.transpose, "-DTRANSPOSE");
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights", build_opts.options()));
+}
+
+Status CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info).first);
+ return Status{};
+}
+
+void CLDepthwiseConvolutionLayerReshapeWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, window);
+ add_2D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window);
+}
+} // namespace arm_compute
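
The new reshape kernel walks the NHWC weights in channel blocks of info.c0 (fixed to 4 by the checks above) and writes them into a 2D destination of width DST_WIDTH, optionally transposed. A toy model of the resulting shape for a 3x3 depthwise kernel with C channels; the second dimension is an assumption made for illustration, only the first dimension is pinned down by the dimension(0)/c0 == 9 check in the 3x3 NHWC kernel:

// Editorial sketch (not part of the patch): reshaped-weights shape for a 3x3
// depthwise kernel with C input channels and c0 == 4. The ceil(C / c0) height
// is assumed for illustration only.
#include <cstddef>
#include <cstdio>
#include <utility>

std::pair<std::size_t, std::size_t> reshaped_weights_shape(std::size_t channels, std::size_t c0 = 4,
                                                           std::size_t kw = 3, std::size_t kh = 3)
{
    const std::size_t width  = c0 * kw * kh;             // matches the dimension(0)/c0 == 9 check
    const std::size_t height = (channels + c0 - 1) / c0; // ceil(C / c0), assumed
    return { width, height };
}

int main()
{
    const auto shape = reshaped_weights_shape(32);
    std::printf("reshaped weights: %zux%zu\n", shape.first, shape.second); // 36x8
    return 0;
}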
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index d5c333a..56e9db5 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,6 +56,7 @@
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
return Status{};
}
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index cdc27e8..2dad729 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,6 +49,7 @@
TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
index 5f4dacb..22149b4 100644
--- a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16,
DataType::F32);
if(bias != nullptr)
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
new file mode 100644
index 0000000..be3c7e2
--- /dev/null
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
+
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CLElementWiseUnaryLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+
+ // Configure kernel window
+ _input = input;
+ _output = output;
+
+ const std::string kernel_name = "elementwise_unary";
+ const int vec_size_x = 16 / output->info()->element_size();
+ const int output_width_x = output->info()->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+ Window win = calculate_max_window(*output->info());
+ if(multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Set kernel build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ switch(op)
+ {
+ case ElementWiseUnary::RSQRT:
+ build_opts.add_option("-DOPERATION=inverse_sqrt");
+ break;
+ case ElementWiseUnary::EXP:
+ build_opts.add_option("-DOPERATION=exponential");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+}
+
+Status CLElementWiseUnaryLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+
+ return Status{};
+}
+
+void CLElementWiseUnaryLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, collapsed);
+ add_1D_tensor_argument(idx, _output, collapsed);
+ enqueue(queue, *this, collapsed);
+ }
+ while(window.slide_window_slice_1D(collapsed));
+}
\ No newline at end of file
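
Instead of requiring padding, the unary kernel widens the window in X to a multiple of vec_size_x and passes LAST_ACCESSED_X so the final, possibly overlapping, vector access stays inside the tensor. A host-side sketch of that clamp, assuming the usual pattern behind the LAST_ACCESSED_X define (the OpenCL source is not shown in this patch):

// Editorial sketch (not part of the patch): how LAST_ACCESSED_X lets the last vector
// load/store be shifted back to the boundary instead of padding the tensor on the right.
#include <algorithm>
#include <cstdio>

int main()
{
    const int width           = 19;                              // elements in X
    const int vec_size_x      = 4;                               // 16 bytes / sizeof(float)
    const int last_accessed_x = std::max(width - vec_size_x, 0); // 15

    for(int x = 0; x < width; x += vec_size_x)
    {
        const int xi = std::min(x, last_accessed_x); // last iteration re-processes a few elements
        std::printf("process [%d, %d)\n", xi, xi + vec_size_x);
    }
    return 0;
}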
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
new file mode 100644
index 0000000..37eeeb7
--- /dev/null
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
+{
+ { ArithmeticOperation::ADD, "ADD" },
+ { ArithmeticOperation::SUB, "SUB" },
+ { ArithmeticOperation::DIV, "DIV" },
+ { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
+ { ArithmeticOperation::MIN, "MIN" },
+ { ArithmeticOperation::MAX, "MAX" },
+};
+
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
+{
+ { ArithmeticOperation::ADD, "ADD" },
+ { ArithmeticOperation::SUB, "SUB" },
+};
+
+std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+ std::string config_id;
+ // Set config_id for enabling LWS tuning
+ config_id = kernel_name;
+ config_id += "_";
+ config_id += lower_string(string_from_data_type(input1.data_type()));
+ config_id += "_";
+ config_id += support::cpp11::to_string(output.dimension(0));
+ config_id += "_";
+ config_id += support::cpp11::to_string(output.dimension(1));
+ return config_id;
+}
+
+Status validate_arguments_with_division_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&input1, &input2, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+Status validate_arguments_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+ }
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+ }
+ }
+ return Status{};
+}
+
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, const std::string &operation_string)
+{
+ CLBuildOptions build_opts;
+
+ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type()));
+ build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DOP=" + operation_string);
+ if(is_data_type_quantized_asymmetric(input1.data_type()))
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1.quantization_info().offset));
+ build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2.quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output.quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1.quantization_info().scale));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2.quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output.quantization_info().scale));
+ }
+ return build_opts;
+}
+
+std::pair<Status, Window> configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+ AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ set_shape_if_empty(output, out_shape);
+
+ if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+ {
+ set_format_if_unknown(output, Format::S16);
+ }
+ else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+ {
+ set_format_if_unknown(output, Format::F16);
+ }
+ else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+ {
+ set_format_if_unknown(output, Format::F32);
+ }
+
+ return configure_window_arithmetic_common(valid_region, input1, input2, output);
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+ auto_init_if_empty(output, out_shape, 1, input1.data_type());
+ return configure_window_arithmetic_common(valid_region, input1, input2, output);
+}
+} // namespace
+
+CLElementwiseOperationKernel::CLElementwiseOperationKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ std::string kernel_name = "elementwise_operation_" + name();
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
+ {
+ kernel_name += "_quantized";
+ }
+
+ // Set kernel build options
+ CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ ICLKernel::configure_internal(win_config.second);
+
+ _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
+}
+
+void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
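+ // Collapsing is only allowed when, from the Z dimension upwards, both inputs have matching sizes (i.e. no broadcasting in those dimensions)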
+ bool can_collapse = true;
+ const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
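+ // Broadcast the slice windows over any dimension of size one so each input advances in lock-step with the output slice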
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice, lws_hint());
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLElementwiseOperationKernel::border_size() const
+{
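+ // When one input is broadcast along X the output is wider than that input; request a right border capped at one processing step minus one element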
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
+
+/** Arithmetic operations with saturation */
+
+void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy)
+{
+ _policy = policy;
+ _op = op;
+ configure_common(input1, input2, output);
+}
+
+Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy)
+{
+ ARM_COMPUTE_UNUSED(op, policy);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+
+ return Status{};
+}
+
+std::pair<Status, Window> CLSaturatedArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+ return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+}
+
+Status CLSaturatedArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ return validate_arguments_with_arithmetic_rules(input1, input2, output);
+}
+
+CLBuildOptions CLSaturatedArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ const bool has_float_out = is_data_type_float(output.data_type());
+ auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
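+ // Saturation is not meaningful for floating-point outputs, so WRAP is forced in that case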
+ build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ return build_options;
+}
+std::string CLSaturatedArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+ auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
+ config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+ config_id += lower_string(string_from_data_layout(input1.data_layout()));
+ return config_id;
+}
+
+std::string CLSaturatedArithmeticOperationKernel::name()
+{
+ return supported_sat_arithmetic_ops[_op];
+}
+
+/** Arithmetic operations */
+
+void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ _op = op;
+ configure_common(input1, input2, output);
+}
+
+Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ if(op == ArithmeticOperation::DIV)
+ {
+ // Division doesn't support integer arithmetic
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_division_rules(*input1, *input2, *output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*input1->clone(), *input2->clone(), *output->clone()).first);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+ }
+
+ return Status{};
+}
+std::pair<Status, Window> CLArithmeticOperationKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+ if(_op == ArithmeticOperation::DIV)
+ {
+ // Division doesn't support integer arithmetic
+ return validate_and_configure_window_for_division(input1, input2, output);
+ }
+ else
+ {
+ return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+ }
+}
+Status CLArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ if(_op == ArithmeticOperation::DIV)
+ {
+ // Division doesn't support integer arithmetic
+ return validate_arguments_with_division_rules(input1, input2, output);
+ }
+ else
+ {
+ return validate_arguments_with_arithmetic_rules(input1, input2, output);
+ }
+}
+
+CLBuildOptions CLArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ return generate_build_options_with_arithmetic_rules(input1, input2, output, name());
+}
+std::string CLArithmeticOperationKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+ return generate_id_for_tuning_common(kernel_name, input1, output);
+}
+
+std::string CLArithmeticOperationKernel::name()
+{
+ return supported_arithmetic_ops[_op];
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 6920667..5fdb826 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,25 +75,18 @@
// Select appropriate kernel
std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
- // Define select type required by replicate border > 1
- const DataType dt = tensor->info()->data_type();
- std::string select_type = get_underlying_cl_type_from_data_type(dt);
- if(is_data_type_float(dt))
- {
- select_type = (DataType::F32 == dt) ? "int" : "short";
- }
+ const DataType dt = tensor->info()->data_type();
// Define build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
- build_opts.emplace(("-DSELECT_TYPE=" + select_type));
- build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
- build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
- build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
- build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+ build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
+ build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
+ build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
+ build_opts.add_option("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
_tensor = tensor;
// Create static kernel arguments
@@ -141,8 +134,9 @@
set_constant_border<float>(idx, constant_border_value);
break;
case DataType::F16:
+ static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half");
static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
- set_constant_border<cl_half>(idx, constant_border_value);
+ set_constant_border<half>(idx, constant_border_value);
break;
default:
ARM_COMPUTE_ERROR("Not handled");
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 5c38568..ef47d20 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,6 +59,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index f333c1b..0857702 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,6 +54,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height, reinterpret_input_as_3d));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index b2fb3e0..1a1a4b7 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,11 +71,25 @@
}
else
{
- const int m = reshape_info.m();
- const int n = reshape_info.n();
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+ const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());
+
+ rhs_info.n0 = 16 / input1->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
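+ // With the dot8 extension available, the interleaved LHS block is kept non-transposed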
+ lhs_info.transpose = !unroll_block;
TensorShape tensor_shape0{ input0->tensor_shape() };
tensor_shape0.set(0, k);
@@ -88,8 +102,8 @@
const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000..e9be1a6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
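+ // (k0 & (k0 - 1)) is zero only for powers of two; k0 == 3 is allowed as an explicit exception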
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ TensorShape tensor_shape0{ input0->tensor_shape() };
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
+
+ TensorInfo tmp_info(*output);
+
+ if(reinterpret_output_as_3d)
+ {
+ // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(output->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+ const int m = gemm_info.m();
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
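+ // e.g. m = 10 with m0 = 4 gives bottom_pad = (4 - (10 % 4)) % 4 = 2 rows, while a multiple of m0 gives 0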
+
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0,
+ ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+ input0->dimension(1));
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ input1->dimension(1));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+
+ output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _k = gemm_info.k();
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+ _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+ std::string kernel_name("gemmlowp_mm_reshaped_");
+ kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+ kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+ kernel_name += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ lhs_info,
+ rhs_info,
+ gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if(_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
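+ // The padding argument follows the three 2D tensor arguments and the four scalars (K and the three Z strides) set in the loop below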
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
+ const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 825d7fb..803ed30 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,14 +86,13 @@
_input = input;
_output = output;
- std::ostringstream ma_arguments;
- ma_arguments << "-DBETA=" << beta;
- std::set<std::string> build_opts;
- build_opts.emplace(ma_arguments.str());
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(beta));
// Create kernel
std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index c9ed776..2b004c2 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,15 +40,16 @@
#include <set>
#include <string>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
namespace
{
using ElementsProcessed = Steps;
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- bool fp_mixed_precision)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
@@ -60,17 +61,40 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool has_vec_c = input2 != nullptr && beta != 0.f;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(has_vec_c && !is_beta_one, "Adding input2 is only supported for beta equal to 1");
+
if(!is_interleaved_transposed)
{
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ if(has_vec_c)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->dimension(0) != input1->dimension(0), "Length of Vector C must match the number of columns of matrix B");
+ }
}
else
{
- const int m = reshape_info.m();
- const int n = reshape_info.n();
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+ rhs_info.n0 = 16 / input1->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
TensorShape tensor_shape0{ input0->tensor_shape() };
tensor_shape0.set(0, k);
@@ -83,11 +107,17 @@
const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(has_vec_c)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+ }
}
if(output->total_size() != 0)
@@ -100,10 +130,11 @@
return Status{};
}
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+ float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
ElementsProcessed &num_elements_processed)
{
+ ARM_COMPUTE_UNUSED(beta);
bool window_changed = false;
Window win{};
Window win_out{};
@@ -113,6 +144,7 @@
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
+ const bool has_vec_c = input2 != nullptr && beta != 0.f;
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -153,16 +185,21 @@
win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
AccessWindowStatic output_access(output, 0, 0,
ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
output->dimension(1) + bottom_pad);
window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
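+ // Vector C is only read along the X dimension, so a horizontal access window on it is sufficient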
+ if(has_vec_c)
+ {
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, input2_access);
+ }
output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
@@ -196,6 +233,11 @@
window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+ if(has_vec_c)
+ {
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, input2_access);
+ }
Coordinates coord;
coord.set_num_dimensions(output->num_dimensions());
@@ -214,20 +256,22 @@
} // namespace
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+ : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _has_vec_c(false)
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- bool fp_mixed_precision)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
+ is_interleaved_transposed, reshape_info, fp_mixed_precision));
_input0 = input0;
_input1 = input1;
+ _input2 = input2;
_output = output;
_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
_reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
@@ -253,7 +297,8 @@
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
+ gpu_target, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -275,6 +320,8 @@
const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+ _has_vec_c = input2 != nullptr && beta != 0.f;
+
std::string kernel_name;
if(is_interleaved_transposed)
{
@@ -338,6 +385,9 @@
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
}
+ // Configure matrix C addition if necessary
+ build_opts.add_option_if(_has_vec_c, "-DADD_VEC_C");
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -360,16 +410,18 @@
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
- const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
{
// Note: num_elements_processed will be set in validate_and_configure_window()
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
input1->clone().get(),
+ (input2 != nullptr) ? input2->clone().get() : nullptr,
output->clone().get(),
+ beta,
is_interleaved_transposed,
reshape_info,
gpu_target,
@@ -396,10 +448,12 @@
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
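+ // When vector C is present, its 1D tensor arguments shift the indices of the cross-plane padding arguments below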
+ const unsigned int num_arguments_vec_c = (_has_vec_c) ? num_arguments_per_1D_tensor() : 0;
+
if(_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_vec_c;
const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -407,7 +461,7 @@
if(_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_vec_c;
const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -425,6 +479,10 @@
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
+ if(_has_vec_c)
+ {
+ add_1D_tensor_argument(idx, _input2, slice);
+ }
add_2D_tensor_argument(idx, _output, slice);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
@@ -433,3 +491,4 @@
}
while(window.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000..b6816ac
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ TensorShape tensor_shape0{ input0->tensor_shape() };
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
+
+ TensorInfo tmp_info(*output);
+
+ if(reinterpret_output_as_3d)
+ {
+ // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(output->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+ const int m = gemm_info.m();
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0,
+ ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+ input0->dimension(1));
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ input1->dimension(1));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+
+ output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+{
+}
+
+void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, lhs_info, rhs_info, gemm_info));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _k = gemm_info.k();
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+ _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+ build_opts.add_option_if(std::abs(1.0f - alpha) > 0.00001f, "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+ std::string kernel_name("gemm_mm_reshaped_");
+ kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+ kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ lhs_info,
+ rhs_info,
+ gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if(_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
+ const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
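+ // Pass K and the Z strides (in bytes) of LHS, RHS and output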
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
new file mode 100644
index 0000000..72f2ca4
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
+ const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
+ bool window_changed = false;
+
+ TensorInfo tmp_info(*input);
+
+ if(reinterpret_input_as_3d)
+ {
+ // Since the input tensor has to be reinterpreted as 3D and the execution window is based on a 2D interleave,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(input->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
+
+ // Configure window
+ // Note: bottom paddings are calculated manually as the input can be reinterpreted as a 3D tensor
+ // The only way to set the paddings correctly is to set them explicitly through AccessWindowStatic
+ const int m = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
+ const int bottom_pad = ceil_to_multiple(m, num_elems_processed_per_iteration_y) - m;
+
+ Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input_access(input, 0, 0,
+ ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
+ input->dimension(1) + bottom_pad);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+ window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
+ update_window_and_padding(win, output_access); // window used to update the padding requirements of output tensor
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMReshapeLHSMatrixKernel::CLGEMMReshapeLHSMatrixKernel()
+ : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
+{
+}
+
+void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d));
+
+ _input = input;
+ _output = output;
+ _reinterpret_input_as_3d = reinterpret_input_as_3d;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
+
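+ // The reshape kernel only moves data, so the OpenCL data type is chosen purely by element size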
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
+ std::string kernel_name("gemm_reshape_lhs_matrix_");
+ kernel_name += lhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), lhs_info, reinterpret_input_as_3d);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_reshape_lhs_matrix_";
+ _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.v0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.interleave);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.transpose);
+}
+
+Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, lhs_info, reinterpret_input_as_3d));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), lhs_info, reinterpret_input_as_3d).first);
+
+ return Status{};
+}
+
+void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ if(_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
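+ // The pad is the first argument after the two 3D tensor arguments (input and output)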
+ const unsigned int idx0 = 2 * num_arguments_per_3D_tensor();
+ const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
new file mode 100644
index 0000000..5b9e68d
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+ const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
+ const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
+ bool window_changed = false;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*input, rhs_info)));
+
+ // Configure window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+ window_changed = update_window_and_padding(win, input_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMReshapeRHSMatrixKernel::CLGEMMReshapeRHSMatrixKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), rhs_info));
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE");
+ build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
+ std::string kernel_name("gemm_reshape_rhs_matrix_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), rhs_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGEMMReshapeRHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, rhs_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), rhs_info).first);
+
+ return Status{};
+}
+
+void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index aa1b92a..986a009 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,6 +55,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
new file mode 100644
index 0000000..412821b
--- /dev/null
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+} // namespace
+
+CLGatherKernel::CLGatherKernel()
+ : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+{
+}
+
+void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input = input;
+ _output = output;
+ _indices = indices;
+ _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions()));
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gather", build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ return Status{};
+}
+
+void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
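+ // Collapse the window along Z where possible; input and output are bound as 4D tensors, indices as a 1D tensor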
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window_collapsed);
+ add_1D_tensor_argument(idx, _indices, window_collapsed);
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed);
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 5d100a4..ab95ddc 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,6 +42,7 @@
Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(anchors);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 54ef23f..8caa927 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,6 +74,7 @@
const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
@@ -192,11 +193,15 @@
num_elems_processed_per_iteration = 2;
is_padding_required_nchw = false;
- // Only the 3x3 case is optimized for NHWC
+ // Only the 3x3 and 9x9 cases are optimized for NHWC
if(kernel_dims == Size2D(3U, 3U))
{
kernel_name = "im2col3x3_";
}
+ else if(kernel_dims == Size2D(9U, 9U))
+ {
+ kernel_name = "im2col9x9_";
+ }
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 97dd919..e33dab0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -49,8 +50,9 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
// Reduce shape on axis
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 67357da..9623ec6 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -37,20 +37,21 @@
namespace
{
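+// Number of elements processed per iteration along X, also used to compute the border width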
+constexpr unsigned int num_elems_processed_per_iteration = 4;
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D,
- "Only Cross-map and 1D In-map normalization is supported for NHWC layout");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
}
@@ -62,8 +63,6 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int num_elems_processed_per_iteration = 4;
-
const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
const bool is_norm_accross_width = norm_idx == 0;
@@ -118,15 +117,14 @@
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = 4;
- const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
-
const DataLayout data_layout = input->info()->data_layout();
const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
_is_norm_across_width = norm_idx == 0;
const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
_border_size = BorderSize(0, border_width);
+ const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
// Set build options
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -140,8 +138,24 @@
build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
// Create kernel
- std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ std::string kernel_name;
+ if(norm_info.is_in_map())
+ {
+ kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
+ }
+ else
+ {
+ if(data_layout == DataLayout::NCHW)
+ {
+ kernel_name = "normalization_layer_cross_map";
+ }
+ else
+ {
+ // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
+ kernel_name = "normalization_layer_in_map_nchw";
+ }
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index a44507b..9033016 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index a9a2c5c..a5fc1a7 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,17 +56,22 @@
DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
- && (perm != PermutationVector{ 1U, 2U, 0U })
- && (perm != PermutationVector{ 3U, 2U, 0U, 1U }),
- "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutation is supported");
- const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4,
+ "Permutation upto 4-D input tensor is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+ "Permutation vector size should be less than or equal to 4");
+ for(const auto &p : perm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+ }
// Validate configured output
if(output->total_size() != 0)
{
+ const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
@@ -87,30 +92,16 @@
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create kernel
- std::set<std::string> build_opts;
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
+ build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+ build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+ build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+ build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Run [2, 0, 1] permute
- if(_perm == PermutationVector{ 2U, 0U, 1U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
- }
- // Run [1, 2, 0] permute
- else if(_perm == PermutationVector{ 1U, 2U, 0U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
- }
- // Run [3, 2, 0, 1] permute
- else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute", build_opts.options()));
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index bd21ea0..7081688 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,6 +78,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 63e745e..c76d839 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -73,8 +73,7 @@
if(output != nullptr && output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
return Status{};
@@ -87,29 +86,11 @@
TensorShape output_shape = compute_prior_box_shape(*input1, info);
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
- Window win{};
- bool window_changed = false;
-
- switch(input1->data_layout())
- {
- case DataLayout::NCHW:
- {
- const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, output_access);
- break;
- }
- case DataLayout::NHWC:
- {
- win = calculate_max_window(*output, Steps());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- };
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, output_access);
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -188,25 +169,8 @@
}
}
- unsigned int idx = 0;
- // Create kernel
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- idx = num_arguments_per_2D_tensor();
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
- break;
- }
- case DataLayout::NHWC:
- {
- idx = num_arguments_per_3D_tensor();
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nhwc", build_opts.options()));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
+ unsigned int idx = num_arguments_per_2D_tensor();
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
_kernel.setArg(idx++, *_min);
_kernel.setArg(idx++, *_max);
@@ -245,31 +209,11 @@
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
- switch(_input1->info()->data_layout())
- {
- case DataLayout::NCHW:
- {
- Window slice = window.first_slice_window_2D();
- slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+ Window slice = window.first_slice_window_2D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- break;
- }
- case DataLayout::NHWC:
- {
- Window slice = window.first_slice_window_3D();
- slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * _num_priors));
- slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 325eeb2..66d2623 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
@@ -47,18 +50,15 @@
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
}
-
return Status{};
}
@@ -67,8 +67,9 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto inizialitation if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+ const TensorShape output_shape = compute_roi_align_shape(*input, *rois, pool_info);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
+ output->set_data_layout(input->data_layout());
// Configure kernel window
const unsigned int num_elems_processed_per_iteration = 1;
@@ -107,12 +108,13 @@
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+ build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
+ build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
// Create kernel
@@ -137,7 +139,7 @@
Window slice_rois = slice;
// Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
- slice.set(Window::DimZ, window[3]);
+ slice.set(get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL), window[3]);
// Set arguments
unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 2367694..df7687e 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,29 +39,61 @@
#include <set>
#include <string>
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+ auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
: _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
{
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+
+ // Validate arguments
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+ ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
- // Output auto inizialitation if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ if(output->info()->total_size() != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+ }
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(rois->num_values() != output->info()->dimension(3));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
// Set instance variables
_input = input;
@@ -89,19 +121,7 @@
add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
- Window window = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(),
- input->info()->valid_region().start(0),
- input->info()->valid_region().start(1),
- input->info()->valid_region().end(0),
- input->info()->valid_region().end(1));
- AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
-
- update_window_and_padding(window, input_access, output_access);
- output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure_internal(window);
+ ICLKernel::configure_internal(win_config.second);
}
void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -109,14 +129,20 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_3D();
- // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+ Window slice = window.first_slice_window_3D();
+ Window slice_rois = slice;
+ // Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
+ slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
slice.set(Window::DimZ, window[3]);
// Set arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+ add_2D_tensor_argument(idx, _rois, slice_rois);
add_3D_tensor_argument(idx, _output, slice);
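+ // Pass the batch strides (in bytes) of the input and output tensors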
+ add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+ add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
enqueue(queue, *this, slice);
}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
new file mode 100644
index 0000000..eb8822b
--- /dev/null
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+namespace
+{
+unsigned int get_num_elems_processed_per_iteration(const DataType dt)
+{
+ unsigned int num_elems_processed_per_iteration = preferred_vector_width(CLKernelLibrary::get().get_device(), dt);
+ if(num_elems_processed_per_iteration > 8)
+ {
+ num_elems_processed_per_iteration = 8; // The kernel uses at most 8 lanes.
+ }
+ return num_elems_processed_per_iteration;
+}
+
+Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
+ 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &output, const float start, const float end, const float step)
+{
+ unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output.data_type());
+ // Auto initialize output if not initialized
+ auto_init_if_empty(output, TensorShape(num_of_elements_in_range(start, end, step)), 1, output.data_type(), output.quantization_info());
+
+ // Configure kernel window
+ Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), TensorShape(num_of_elements_in_range(start, end, step))));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLRangeKernel::CLRangeKernel()
+ : _start(0), _end(1), _step(1), _output(nullptr)
+{
+}
+
+void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*(output->info()), start, end, step);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _start = start;
+ _end = end;
+ _step = step;
+ _output = output;
+
+ std::string kernel_name = "range";
+
+ unsigned int num_elems_processed_per_iteration = get_num_elems_processed_per_iteration(output->info()->data_type());
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
+ build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
+ if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ {
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ kernel_name += "_quantized";
+ }
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(output->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+}
+
+Status CLRangeKernel::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*output, start, end, step));
+ ARM_COMPUTE_RETURN_ON_ERROR((validate_and_configure_window(*(output->clone()), start, end, step)).first);
+
+ return Status{};
+}
+
+void CLRangeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _output, window);
+
+ enqueue(queue, *this, window, lws_hint());
+}
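
A minimal host-side sketch of the element-count arithmetic the checks above rely on. The helper num_of_elements_in_range() exists in the library; the body below is an illustrative assumption about its behaviour, not the library source:

#include <cmath>
#include <cstddef>

// Illustrative stand-in for num_of_elements_in_range(): assumes the count is
// ceil(|end - start| / |step|), consistent with the start/end/step checks above.
std::size_t range_element_count_sketch(float start, float end, float step)
{
    return static_cast<std::size_t>(std::ceil(std::fabs(end - start) / std::fabs(step)));
}

// Example: start = 0, end = 10, step = 3 -> ceil(10 / 3) = 4 elements {0, 3, 6, 9},
// so validate_arguments() requires a 1-D output holding at least 4 elements.
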
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index ef46325..1f4cff3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -45,6 +46,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
@@ -53,29 +55,41 @@
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
// Output tensor auto initialization if not yet initialized
TensorShape output_shape{ input->tensor_shape() };
output_shape.set(axis, 1);
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+ DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+ auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type()));
switch(axis)
{
case 0:
{
- if(is_data_type_quantized(input->data_type()))
+ if(is_serial_op)
{
AccessWindowHorizontal input_access(input, 0, input->dimension(0));
AccessWindowHorizontal output_access(output, 0, 1);
@@ -136,14 +150,17 @@
// Set build options
CLBuildOptions build_opts;
std::string data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
- if(is_data_type_quantized(input->info()->data_type()) && axis != 0)
+ if(is_data_type_quantized(input->info()->data_type()))
{
data_type_promoted = "uint";
}
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
- build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+ build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
+ build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
+ build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
+ build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
switch(op)
{
@@ -154,6 +171,12 @@
case ReductionOperation::MEAN_SUM:
build_opts.add_option(("-DOPERATION=sum"));
break;
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ break;
+ case ReductionOperation::PROD:
+ build_opts.add_option(("-DOPERATION=product"));
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported reduction operation");
}
@@ -161,11 +184,18 @@
// Create kernel
cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type()));
switch(axis)
{
case 0:
{
- if(!is_data_type_quantized(input->info()->data_type()))
+ if(is_serial_op)
+ {
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+ kernel_axis_name = "non_parallel_x";
+ }
+ else
{
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
const unsigned int width_leftover = input->info()->dimension(0) % border_val;
@@ -178,11 +208,6 @@
lws_hint = cl::NDRange(std::min(8U, num_of_threads));
_border_size = BorderSize(0, border_width, 0, 0);
}
- else
- {
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- kernel_axis_name = "quantized_x";
- }
}
break;
case 1:
@@ -204,7 +229,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -214,7 +239,7 @@
Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
return Status{};
}
@@ -224,39 +249,13 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type()));
switch(_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(!is_data_type_quantized(_input->info()->data_type()))
- {
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
- // Set local sums buffer
- unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
- }
- else
+ if(is_serial_op)
{
// Get first input and output slices
Window window_in{ window };
@@ -274,6 +273,33 @@
}
while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
}
+ else
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Reshape window
+ const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+ // Set local sums buffer
+ unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
}
break;
case 1:
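
With this change ARG_IDX_MIN/ARG_IDX_MAX reductions take the serial path and produce indices rather than values. A hedged validation sketch, with illustrative tensor shapes, of what the new checks accept:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"

using namespace arm_compute;

void argmax_validate_sketch()
{
    // Reduce a 128x16 F32 tensor along axis 0; the shapes are illustrative assumptions.
    const TensorInfo input(TensorShape(128U, 16U), 1, DataType::F32);
    // Arg-min/max outputs hold indices, so the output data type must be U32.
    const TensorInfo output(TensorShape(1U, 16U), 1, DataType::U32);
    ARM_COMPUTE_ERROR_THROW_ON(CLReductionOperationKernel::validate(&input, &output, 0, ReductionOperation::ARG_IDX_MAX, 0));
}
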
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
new file mode 100644
index 0000000..84bf5bf
--- /dev/null
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLReverseKernel::CLReverseKernel()
+ : _input(nullptr), _output(nullptr), _axis(nullptr)
+{
+}
+
+void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+
+ // Set kernel build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0)));
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reverse", build_opts.options()));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_1D_tensor();
+ add_argument<cl_uint>(idx, input->info()->dimension(0));
+ add_argument<cl_uint>(idx, input->info()->dimension(1));
+ add_argument<cl_uint>(idx, input->info()->dimension(2));
+ add_argument<cl_uint>(idx, input->info()->dimension(3));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id += "reverse_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+}
+
+Status CLReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+ return Status{};
+}
+
+void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_4D();
+ Window axis_slice = collapsed.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _axis, axis_slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
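
A hedged validation sketch of the axis-tensor contract enforced above: the axis input is a 1-D U32 tensor listing at most four dimensions to reverse. The shapes are illustrative assumptions:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/CL/kernels/CLReverseKernel.h"

using namespace arm_compute;

void reverse_validate_sketch()
{
    const TensorInfo input(TensorShape(32U, 24U, 3U), 1, DataType::F32);
    const TensorInfo output(input); // must match the input's shape and data type
    // One U32 entry per dimension to reverse, e.g. {0, 1} to flip width and height.
    const TensorInfo axis(TensorShape(2U), 1, DataType::U32);
    ARM_COMPUTE_ERROR_THROW_ON(CLReverseKernel::validate(&input, &output, &axis));
}
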
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index ce6c016..cd89d1c 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,6 +65,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output == input);
float wr = 0.f;
@@ -94,14 +95,11 @@
num_elems_processed_per_iteration = 4;
// Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- const ValidRegion &input_valid_region = input->valid_region();
-
- // Reads can occur within the valid region of the input
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowStatic input_access(input,
- input_valid_region.anchor[0] - border.left, input_valid_region.anchor[1] - border.top,
- input_valid_region.anchor[0] + input_valid_region.shape[0] + border.right,
- input_valid_region.anchor[1] + input_valid_region.shape[1] + border.bottom);
+ -border.left, -border.top,
+ input->dimension(0) + border.right,
+ input->dimension(1) + border.bottom);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
@@ -118,7 +116,9 @@
num_elems_processed_per_iteration = 1;
// Configure kernel window
win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowRectangle input_access(input, -border.left, -border.top, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowStatic input_access(input, -border.left, -border.top,
+ input->dimension(0) + border.right,
+ input->dimension(1) + border.bottom);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -175,6 +175,7 @@
DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
// Compute the ratio between source width/height and destination width/height
const unsigned int input_width = input->info()->dimension(idx_width);
@@ -201,6 +202,7 @@
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
if(call_quantized_kernel)
{
@@ -215,7 +217,7 @@
kernel_name += lower_string(string_from_data_layout(data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
- unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
// Set static kernel arguments
const float scale_x = static_cast<float>(input_width) / output_width;
@@ -225,6 +227,20 @@
_kernel.setArg<float>(idx++, input_height);
_kernel.setArg<float>(idx++, scale_x);
_kernel.setArg<float>(idx++, scale_y);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "scale_";
+ _config_id += (border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
+ _config_id += (sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
+ _config_id += (is_nhwc ? "nhwc" : "nchw");
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(3));
}
void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -250,16 +266,13 @@
}
case DataLayout::NHWC:
{
- Window slice = window.first_slice_window_3D();
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_4D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
break;
}
default:
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
new file mode 100644
index 0000000..c9e5da0
--- /dev/null
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(x);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(x,
+ 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, y);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::U8);
+
+ const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+
+ if(output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *c, ITensorInfo *x, ITensorInfo *y, ITensorInfo *output)
+{
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *x->clone());
+ }
+
+ const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+
+ const unsigned int num_elems_processed_per_iteration = 16 / x->element_size();
+
+ // Configure kernel window
+ Window win = calculate_max_window(*x, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal x_access(x, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal y_access(y, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, x_access, y_access);
+
+ // Update window for condition
+ if(is_same_rank)
+ {
+ AccessWindowHorizontal c_access(c, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, c_access);
+ }
+
+ // Update window for output
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, x->valid_region());
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLSelectKernel::CLSelectKernel()
+ : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+{
+}
+void CLSelectKernel::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
+
+ _c = c;
+ _x = x;
+ _y = y;
+ _output = output;
+ _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
+
+ const unsigned int num_elems_processed_per_iteration = 16 / x->info()->element_size();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(x->info()->data_type()));
+ build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(x->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ // Create kernel
+ std::string kernel_name = "select";
+ if(_has_same_rank)
+ {
+ kernel_name += "_same_rank";
+ }
+ else
+ {
+ const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
+ if(is_input_rank_greater_than_two)
+ {
+ const size_t width = x->info()->tensor_shape().x();
+ const size_t height = x->info()->tensor_shape().y();
+ const size_t outer_size = x->info()->tensor_shape()[x->info()->tensor_shape().num_dimensions() - 1];
+ const size_t depth_size = x->info()->tensor_shape().total_size() / (width * height * outer_size);
+ build_opts.add_option("-DDEPTH_SIZE=" + support::cpp11::to_string(depth_size));
+ }
+ kernel_name += "_different_rank";
+ kernel_name += is_input_rank_greater_than_two ? "_n" : "_2";
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(c->info(), x->info(), y->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ _config_id = "select_";
+ _config_id += string_from_data_type(x->info()->data_type());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(x->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(x->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(x->info()->dimension(2));
+}
+
+Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(c->clone().get(), x->clone().get(), y->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ if(!_has_same_rank)
+ {
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _c, vector_slice);
+ }
+
+ do
+ {
+ unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
+ if(_has_same_rank)
+ {
+ add_3D_tensor_argument(idx, _c, slice);
+ }
+ add_3D_tensor_argument(idx, _x, slice);
+ add_3D_tensor_argument(idx, _y, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
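
A hedged sketch of the rank rule validated above: the condition tensor either matches x element for element, or is a 1-D vector whose length equals the outermost dimension of x and selects whole slices. The shapes are illustrative assumptions:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/CL/kernels/CLSelectKernel.h"

using namespace arm_compute;

void select_validate_sketch()
{
    const TensorInfo x(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    const TensorInfo y(x);
    const TensorInfo output(x);

    // Same rank: the U8 condition matches x element for element.
    const TensorInfo c_same(TensorShape(8U, 4U, 2U), 1, DataType::U8);
    ARM_COMPUTE_ERROR_THROW_ON(CLSelectKernel::validate(&c_same, &x, &y, &output));

    // Different rank: a 1-D condition whose length equals the outermost dimension of x
    // (here 2), selecting whole 8x4 slices at a time.
    const TensorInfo c_outer(TensorShape(2U), 1, DataType::U8);
    ARM_COMPUTE_ERROR_THROW_ON(CLSelectKernel::validate(&c_outer, &x, &y, &output));
}
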
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index d488631..f039198 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -39,10 +39,16 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
// Validate output if initialized
if(output->total_size() != 0)
{
+ const DataLayout data_layout = input->data_layout();
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -64,8 +70,8 @@
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y());
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] / block_shape_x != (output->tensor_shape()[idx_width] - padding_left.x() - padding_right.y()));
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] / block_shape_y != (output->tensor_shape()[idx_height] - padding_left.x() - padding_right.y()));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -101,6 +107,9 @@
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+ build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+ build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
// Configure kernel window
@@ -132,6 +141,9 @@
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+ build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+ build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
build_opts.add_option("-DPAD_LEFT_X=" + support::cpp11::to_string(padding_left.x()));
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
new file mode 100644
index 0000000..ccbe1fc
--- /dev/null
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+{
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input);
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLStackLayerKernel::CLStackLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(axis));
+ build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DDST_DIM3=" + support::cpp11::to_string(output->info()->dimension(3)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("stack_layer", build_opts.options()));
+
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ const unsigned int idx = 2 * num_arguments_per_4D_tensor();
+ _kernel.setArg<cl_uint>(idx, idx_input);
+}
+
+Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ return Status{};
+}
+
+void CLStackLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_out;
+ window_out.use_tensor_dimensions(_output->info()->tensor_shape());
+
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+ Window slice_in = collapsed.first_slice_window_4D();
+ Window slice_out = window_out.first_slice_window_4D();
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+}
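
A hedged sketch of the output shape the validation above compares against. compute_stack_shape() is a real shape-calculator helper; the body below is an illustrative assumption about its semantics (a new dimension of size num_tensors inserted at position axis), not the library implementation:

#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

// Illustrative stand-in for misc::shape_calculator::compute_stack_shape().
TensorShape stack_shape_sketch(const TensorShape &input_shape, unsigned int axis, unsigned int num_tensors)
{
    TensorShape output_shape;
    unsigned int out_dim = 0;
    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
    {
        if(out_dim == axis)
        {
            output_shape.set(out_dim++, num_tensors); // the new stacked dimension
        }
        output_shape.set(out_dim++, input_shape[i]);
    }
    if(axis == input_shape.num_dimensions())
    {
        output_shape.set(out_dim, num_tensors); // stacking on a new outermost axis
    }
    return output_shape;
}

// Example: stacking 5 tensors of shape (8, 4) on axis 1 gives an output of shape (8, 5, 4).
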
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 2d2ba10..c40f3c9 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -69,7 +70,8 @@
// Checks output if configured
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape() != exp_output_shape);
+ const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -113,9 +115,11 @@
const TensorShape &input_shape = input->info()->tensor_shape();
- const Coordinates final_strides = arm_compute::helpers::tensor_transform::strided_slice_strides(input_shape, strides);
- const Coordinates starts_abs = arm_compute::helpers::tensor_transform::strided_slice_absolute_start_coords(input_shape, starts, final_strides, begin_mask);
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::strided_slice_absolute_end_coords(input_shape, starts_abs, ends, final_strides, end_mask, shrink_axis_mask);
+ Coordinates starts_abs, ends_abs, final_strides;
+ std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
+ input_shape,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
@@ -124,7 +128,8 @@
// Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
const int vec_size_x = 16 / input->info()->element_size();
const int output_width_x = output->info()->tensor_shape().x();
- const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
+ const bool is_shrink_on_x = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
+ const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
if(multi_access_x)
@@ -140,8 +145,10 @@
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
+ const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
new file mode 100644
index 0000000..7559e7a
--- /dev/null
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
+ {
+ return e == 0;
+ }));
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLTileKernel::CLTileKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output
+ TensorShape tiled_shape = misc::shape_calculator::compute_tiled_shape(input->info()->tensor_shape(), multiples);
+ auto_init_if_empty(*output->info(), tiled_shape, 1, input->info()->data_type());
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), multiples));
+
+ _input = input;
+ _output = output;
+
+ const DataType data_type = input->info()->data_type();
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width_x));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DSRC_BATCHES=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("tile", build_opts.options()));
+
+ // Configure window without padding
+ Window win = calculate_max_window(*output->info());
+
+ if(multi_access_x)
+ {
+ // If multi-access is enabled, no thread should cross the tile boundaries. This means we need
+ // as many threads as are required to cover a single tile, times multiples[0]. Note that if threads
+ // do not cross the boundaries of the tiles, they cannot cross the boundaries of the last tile either,
+ // so the output does not need to be padded.
+ const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ }
+
+ ICLKernel::configure_internal(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "tile";
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ for(unsigned int i = 0; i < multiples.size(); ++i)
+ {
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(i));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(multiples[i]);
+ }
+}
+
+Status CLTileKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, multiples));
+ return Status{};
+}
+
+void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
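
The window set-up above sizes the X dimension so that vectorised threads never straddle a tile boundary. A small worked example of that arithmetic, with illustrative sizes (ceil_to_multiple() is reimplemented here only to keep the sketch self-contained):

#include <cassert>

// Smallest multiple of `multiple` that is >= value, matching the helper used above.
constexpr unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    // Illustrative numbers: an F32 input of width 10, tiled 3 times along X.
    const unsigned int element_size = 4;                 // sizeof(float)
    const unsigned int vec_size_x   = 16 / element_size; // 4 elements per access
    const unsigned int src_width    = 10;
    const unsigned int multiple_x   = 3;

    // Each tile is covered by ceil(10 / 4) * 4 = 12 window positions, so the X window
    // spans 12 * 3 = 36 positions stepped by 4 -- no thread crosses a tile boundary.
    const unsigned int size_win_x = ceil_to_multiple_sketch(src_width, vec_size_x) * multiple_x;
    assert(size_win_x == 36);
    return 0;
}
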
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index ccf22ea..6c237a8 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,6 +69,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index ee3fa11..ce5ed86 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -49,12 +50,15 @@
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
return Status{};
}
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index b0d27cb..d58cef5 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,9 +49,11 @@
{
// The window needs to be based on the output
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
- AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
- input2->dimension(1));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration), input1->dimension(1));
+ const unsigned int input2_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(0) + num_elems_processed_per_iteration - input2->dimension(0);
+ AccessWindowStatic input2_access(input2, -(input1->dimension(0) % num_elems_processed_per_iteration),
+ 0, input2->dimension(0) + input2_right_padding, input2->dimension(1));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
@@ -109,6 +111,16 @@
build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+ }
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
@@ -118,6 +130,14 @@
ICLKernel::configure_internal(std::get<1>(win_config));
+ // Pass paddings as arguments to the kernel
+ const unsigned int input1_width = input1->info()->dimension(0);
+ const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
+ const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
+ unsigned int idx0 = 3 * num_arguments_per_4D_tensor();
+ _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
+ _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
+
// Set config_id for enabling LWS tuning
_config_id = "concatenate_width_x2_";
_config_id += lower_string(string_from_data_type(input1->info()->data_type()));
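
The new access windows above replace the blanket one-extra-step over-allocation with padding derived from where each input starts and ends relative to the vector step. A standalone sketch of that arithmetic, with hypothetical names (not part of the patch):

    #include <cstdio>

    // Illustrative sketch of the padding needed by the x2 width-concatenate kernel.
    // 'step' plays the role of num_elems_processed_per_iteration.
    struct PaddingX2
    {
        unsigned int input1_right;
        unsigned int input2_left;
        unsigned int input2_right;
    };

    PaddingX2 compute_x2_padding(unsigned int input1_width, unsigned int input2_width, unsigned int step)
    {
        const unsigned int output_width = input1_width + input2_width;
        PaddingX2 p{};
        // input1 is only ever read up to the next multiple of the vector step.
        p.input1_right = ((input1_width + step - 1) / step) * step - input1_width;
        // input2 starts mid-vector whenever input1_width is not a multiple of the step.
        p.input2_left = input1_width % step;
        // The last vector store covering input2 may read one step past its end.
        p.input2_right = (output_width / step) * step - input1_width + step - input2_width;
        return p;
    }

    int main()
    {
        const PaddingX2 p = compute_x2_padding(10, 7, 8); // widths 10 and 7, vector step 8
        std::printf("in1 right=%u in2 left=%u in2 right=%u\n", p.input1_right, p.input2_left, p.input2_right);
        return 0;
    }
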
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 75aef9c..9cbb713 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,15 +47,29 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
{
+ const unsigned int input1_width = input1->dimension(0);
+ const unsigned int input2_width = input2->dimension(0);
+ const unsigned int input3_width = input3->dimension(0);
+ const unsigned int input4_width = input4->dimension(0);
+
// The window needs to be based on the output
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
- AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
- input2->dimension(1));
- AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
- input3->dimension(1));
- AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
- input4->dimension(1));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1_width, num_elems_processed_per_iteration), input1->dimension(1));
+
+ const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
+ const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
+ input2_width;
+ AccessWindowStatic input2_access(input2, -input2_left_padding, 0, input2_width + input2_right_padding, input2->dimension(1));
+
+ const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration;
+ const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
+ num_elems_processed_per_iteration - input3_width;
+ AccessWindowStatic input3_access(input3, -input3_left_padding, 0, input3_width + input3_right_padding, input3->dimension(1));
+
+ const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
+ const unsigned int input4_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration + num_elems_processed_per_iteration - output->dimension(0);
+ AccessWindowStatic input4_access(input4, -input4_left_padding, 0, input4_width + input4_right_padding, input4->dimension(1));
+
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
@@ -119,6 +133,20 @@
build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().scale));
+ }
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
@@ -128,6 +156,27 @@
ICLKernel::configure_internal(std::get<1>(win_config));
+ // Pass paddings as arguments to the kernel
+ const unsigned int input1_width = input1->info()->dimension(0);
+ const unsigned int input2_width = input2->info()->dimension(0);
+ const unsigned int input3_width = input3->info()->dimension(0);
+
+ const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
+ const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration;
+ const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration -
+ input2_width;
+ const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration;
+ const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width +
+ num_elems_processed_per_iteration - input3_width;
+ const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration;
+ unsigned int idx0 = 5 * num_arguments_per_4D_tensor();
+ _kernel.setArg<cl_uint>(idx0++, input1_right_padding);
+ _kernel.setArg<cl_uint>(idx0++, input2_left_padding);
+ _kernel.setArg<cl_uint>(idx0++, input2_right_padding);
+ _kernel.setArg<cl_uint>(idx0++, input3_left_padding);
+ _kernel.setArg<cl_uint>(idx0++, input3_right_padding);
+ _kernel.setArg<cl_uint>(idx0++, input4_left_padding);
+
// Set config_id for enabling LWS tuning
_config_id = "concatenate_width_x4_";
_config_id += lower_string(string_from_data_type(input1->info()->data_type()));
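
The four-input variant applies the same idea with a running width offset: each input's left padding is that offset modulo the vector step, and its right padding covers the last vector that touches it. A generalised sketch for N inputs (illustrative helper, not library code):

    #include <cstdio>
    #include <vector>

    struct Padding
    {
        unsigned int left;
        unsigned int right;
    };

    // Illustrative sketch: per-input left/right padding for an N-way width concatenation,
    // where each input starts at the running sum of the widths before it.
    std::vector<Padding> width_concat_padding(const std::vector<unsigned int> &widths, unsigned int step)
    {
        std::vector<Padding> pads(widths.size());
        unsigned int offset = 0; // start position of the current input in the output
        for(std::size_t i = 0; i < widths.size(); ++i)
        {
            const unsigned int end = offset + widths[i];
            pads[i].left  = offset % step;                    // input starts mid-vector
            pads[i].right = (end / step) * step + step - end; // last vector may run past its end
            offset = end;
        }
        // The first input never needs left padding and only needs alignment on the right.
        pads.front().left  = 0;
        pads.front().right = ((widths.front() + step - 1) / step) * step - widths.front();
        return pads;
    }

    int main()
    {
        const auto pads = width_concat_padding({ 10, 7, 5, 12 }, 8);
        for(const auto &p : pads)
        {
            std::printf("left=%u right=%u\n", p.left, p.right);
        }
        return 0;
    }
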
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index c51c579..6c32cd2 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -107,9 +107,16 @@
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ }
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
-
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
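
All three width-concatenate kernels now pass the input and output quantization parameters to the OpenCL build when they differ, so the kernel can dequantize with each input's (scale, offset) and requantize with the output's. A minimal scalar sketch of that mapping, assuming the usual QASYMM8 affine scheme (names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Illustrative: re-map a QASYMM8 value from one (scale, offset) pair to another,
    // which is what the concatenate kernels do per element when an input's
    // quantization info differs from the output's.
    uint8_t requantize(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float real  = scale_in * (static_cast<int>(q_in) - offset_in);               // dequantize
        const int   q_out = static_cast<int>(std::lround(real / scale_out)) + offset_out;  // requantize
        return static_cast<uint8_t>(std::min(255, std::max(0, q_out)));                    // saturate to uint8
    }

    int main()
    {
        // Example: value 200 with scale 0.1 / offset 10, mapped into scale 0.2 / offset 0.
        std::printf("%u\n", requantize(200, 0.1f, 10, 0.2f, 0)); // prints 95
        return 0;
    }
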
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 7f1afe0..84b5ea2 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -46,8 +46,18 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+ "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
+ }
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -133,14 +143,14 @@
{
}
-void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info)
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
_input = input;
_bias = bias;
@@ -161,6 +171,21 @@
// Set build options
CLBuildOptions build_opts;
+ build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+ if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+ {
+ build_opts.add_option("-DVEC_SIZE=2");
+ }
+ else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+ {
+ build_opts.add_option("-DVEC_SIZE=4");
+ }
+
+ build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+
build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
@@ -195,9 +220,9 @@
_config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
}
-Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info, act_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
return Status{};
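
The Winograd output transform can now fuse an activation, driven by the FUSED_ACTIVATION, A_VAL and B_VAL defines and, presumably, applied to each output element after the bias add. A scalar sketch of what those parameters select, under the assumption that A_VAL/B_VAL carry the activation's a and b (illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    enum class Act { NONE, RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LOGISTIC };

    // Illustrative: activation applied to one output element, mirroring the
    // A_VAL/B_VAL parametrisation used by the build options.
    float fused_activation(float x, Act act, float a, float b)
    {
        switch(act)
        {
            case Act::RELU:            return std::max(0.f, x);
            case Act::BOUNDED_RELU:    return std::min(a, std::max(0.f, x)); // clamp to [0, a]
            case Act::LU_BOUNDED_RELU: return std::min(a, std::max(b, x));   // clamp to [b, a]
            case Act::LOGISTIC:        return 1.f / (1.f + std::exp(-x));
            case Act::NONE:
            default:                   return x;
        }
    }

    int main()
    {
        std::printf("%f\n", fused_activation(7.5f, Act::LU_BOUNDED_RELU, 6.f, 0.f)); // prints 6.0
        return 0;
    }
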
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
index 7d9dbd4..ee9bdec 100644
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,10 +43,10 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
{
- ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 06a0551..02150ff 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 17eaec2..d9fe5b0 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -58,17 +58,6 @@
return Status{};
}
-template <typename T>
-inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
-{
- const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
- {
- T dimension_val = old_dim[i];
- dimensions.set(perm[i], dimension_val);
- }
-}
-
} // namespace
template <typename T>
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
new file mode 100644
index 0000000..533543a
--- /dev/null
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T,
+ typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+inline bool greater_than(T a, T b)
+{
+ const T epsilon = std::numeric_limits<T>::epsilon();
+ return (a - b > epsilon);
+}
+
+template < typename T,
+ typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+inline bool greater_than(T a, T b)
+{
+ return (a > b);
+}
+
+Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
+ // Validate configured output
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ }
+
+ return Status{};
+}
+} // namespace
+
+template <typename T>
+void CPPTopKVKernel::run_topkv()
+{
+ for(unsigned int i = 0; i < _batch_size; ++i)
+ {
+ const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
+ const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+
+ // The rank counts how many predictions are strictly greater than the one for target_class_id
+ unsigned int rank = 0;
+ for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+ {
+ const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
+ if(greater_than(current_prediction, predicted_value))
+ {
+ rank++;
+ }
+ }
+ *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+ }
+}
+
+CPPTopKVKernel::CPPTopKVKernel()
+ : _predictions(nullptr), _targets(nullptr), _output(nullptr), _k(), _batch_size(), _num_classes()
+{
+}
+
+void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(predictions->info(), targets->info(), output->info(), k));
+ auto_init_if_empty(*output->info(), targets->info()->tensor_shape(), 1, DataType::U8);
+
+ _predictions = predictions;
+ _targets = targets;
+ _output = output;
+
+ _k = k;
+ _batch_size = predictions->info()->dimension(1);
+ _num_classes = predictions->info()->dimension(0);
+
+ ICPPKernel::configure(Window()); // Default 1 iteration window
+}
+
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
+ return Status{};
+}
+
+bool CPPTopKVKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(window, info);
+ switch(_predictions->info()->data_type())
+ {
+ case DataType::F32:
+ run_topkv<float>();
+ break;
+ case DataType::F16:
+ run_topkv<half>();
+ break;
+ case DataType::S32:
+ run_topkv<int>();
+ break;
+ case DataType::QASYMM8:
+ run_topkv<uint8_t>();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+} // namespace arm_compute
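
CPPTopKVKernel writes, for each batch item, whether the prediction for the supplied target class ranks within the top K scores. A standalone sketch of that check (hypothetical helper; the kernel additionally guards the floating-point comparison with an epsilon):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Illustrative: returns 1 when the score of 'target_class' is within the top-k
    // scores. Only values strictly greater than the target's count towards its rank,
    // so ties favour the target, as in the kernel.
    uint8_t in_top_k(const std::vector<float> &predictions, unsigned int target_class, unsigned int k)
    {
        const float  target_value = predictions[target_class];
        unsigned int rank         = 0; // number of classes scored strictly higher than the target
        for(const float v : predictions)
        {
            if(v > target_value)
            {
                ++rank;
            }
        }
        return static_cast<uint8_t>(rank < k);
    }

    int main()
    {
        const std::vector<float> scores = { 0.1f, 0.7f, 0.05f, 0.15f };
        std::printf("%u\n", in_top_k(scores, 3, 2)); // class 3 has rank 1, so it is in the top 2 -> prints 1
        return 0;
    }
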
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 2f6a94b..e7b4365 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -54,9 +54,9 @@
va_start(args, msg);
auto err = create_error_va_list(ErrorCode::RUNTIME_ERROR, function, file, line, msg, args);
va_end(args);
- throw std::runtime_error(err.error_description());
+ ARM_COMPUTE_THROW(std::runtime_error(err.error_description()));
}
void Status::internal_throw_on_error() const
{
- throw std::runtime_error(_error_description);
+ ARM_COMPUTE_THROW(std::runtime_error(_error_description));
}
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 3dffcd0..e6c80e8 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,6 +71,7 @@
src_it, dst_it);
}
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
{
ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
@@ -151,6 +152,7 @@
}
}
}
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
bool ITensor::is_used() const
{
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 5ce79f1..b67396c 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/QAsymm8.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -60,29 +61,21 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- bool window_changed = false;
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
- if(output != nullptr && (output->total_size() != 0))
+ if(output != nullptr)
{
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
- window_changed = update_window_and_padding(win,
- AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration),
- output_access);
-
- output_access.set_valid_region(win, input->valid_region());
- }
- else
- {
- // In-place computation
- window_changed = update_window_and_padding(win,
- AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+ // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} // namespace
@@ -101,16 +94,15 @@
if(output != nullptr)
{
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
_output = output;
}
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
- ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
- "For QASYMM8 only relu and lower/upper bounded relu are supported");
+ ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+ && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+ "For QASYMM8 only logistic, relu and lower/upper bounded relu are supported");
// Activation functions : FP32
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -149,6 +141,8 @@
// Activation functions : QASYMM8
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
{
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qasymm8_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t> },
{ ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t> },
{ ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qasymm8_t> },
};
@@ -176,337 +170,129 @@
ICPPKernel::configure(win_config.second);
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
+NEActivationLayerKernel::activation(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
- static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
- static const float16x8_t CONST_1_H = vdupq_n_f16(1.f);
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationFunction act = F;
- static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f);
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
- const float16x8_t a = vdupq_n_f16(_act_info.a());
- const float16x4_t a_h = vdup_n_f16(_act_info.a());
- const float16x8_t b = vdupq_n_f16(_act_info.b());
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
- execute_window_loop(window, [&](const Coordinates &)
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{});
+ const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{});
+ const auto a = static_cast<T>(_act_info.a());
+ const auto b = static_cast<T>(_act_info.b());
+
+ execute_window_loop(win_collapsed, [&](const Coordinates & id)
{
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- const float16x8x2_t in = vld2q_f16(input_ptr);
- float16x8x2_t tmp = { {} };
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
- switch(F)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
- case ActivationFunction::ABS:
- tmp =
- {
- {
- vabsq_f16(in.val[0]),
- vabsq_f16(in.val[1]),
- }
- };
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])),
- vminq_f16(a, vmaxq_f16(CONST_0, in.val[1]))
- }
- };
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f16(a, vmaxq_f16(b, in.val[0])),
- vminq_f16(a, vmaxq_f16(b, in.val[1]))
- }
- };
- break;
- case ActivationFunction::LINEAR:
- tmp =
- {
- {
- vaddq_f16(b, vmulq_f16(a, in.val[0])),
- vaddq_f16(b, vmulq_f16(a, in.val[1]))
- }
- };
- break;
- case ActivationFunction::LOGISTIC:
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ switch(act)
{
- tmp =
- {
- {
- vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))),
- vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1]))))
- }
- };
+ case ActivationFunction::ABS:
+ tmp = wrapper::vabs(vin);
+ break;
+ case ActivationFunction::LINEAR:
+ tmp = wrapper::vmla(vb, va, vin);
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+ break;
+ case ActivationFunction::RELU:
+ tmp = wrapper::vmax(const_0, vin);
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+ break;
+ case ActivationFunction::SQRT:
+ tmp = wrapper::vinv(wrapper::vinvsqrt(vin));
+ break;
+ case ActivationFunction::SQUARE:
+ tmp = wrapper::vmul(vin, vin);
+ break;
+ case ActivationFunction::TANH:
+ tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
}
- break;
- case ActivationFunction::RELU:
- tmp =
- {
- {
- vmaxq_f16(CONST_0, in.val[0]),
- vmaxq_f16(CONST_0, in.val[1])
- }
- };
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp =
- {
- {
- vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])),
- vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1]))
- }
- };
- break;
- case ActivationFunction::SOFT_RELU:
- {
- // TODO (COMPMID-1535) : Revisit FP16 approximations
- const float16x4x2_t in0 =
- {
- vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))),
- vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[0])))))),
- };
-
- const float16x4x2_t in1 =
- {
- vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[1])))))),
- vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[1])))))),
- };
-
- tmp =
- {
- {
- vcombine_f16(in0.val[0], in0.val[1]),
- vcombine_f16(in1.val[0], in1.val[1]),
- }
- };
- }
- break;
- case ActivationFunction::SQRT:
- tmp =
- {
- {
- vinvq_f16(vinvsqrtq_f16(in.val[0])),
- vinvq_f16(vinvsqrtq_f16(in.val[1])),
- }
- };
- break;
- case ActivationFunction::SQUARE:
- tmp =
- {
- {
- vmulq_f16(in.val[0], in.val[0]),
- vmulq_f16(in.val[1], in.val[1])
- }
- };
- break;
- case ActivationFunction::TANH:
- {
- // TODO (COMPMID-1535) : Revisit FP16 approximations
- const float16x8x2_t mul =
- {
- vmulq_f16(b, in.val[0]),
- vmulq_f16(b, in.val[1])
- };
- const float16x4x2_t in0 =
- {
- vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[0]))))),
- vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[0]))))),
- };
-
- const float16x4x2_t in1 =
- {
- vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[1]))))),
- vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[1]))))),
- };
-
- tmp =
- {
- {
- vcombine_f16(in0.val[0], in0.val[1]),
- vcombine_f16(in1.val[0], in1.val[1]),
- }
- };
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- break;
+ wrapper::vstore(output_ptr + x, tmp);
}
- vst2q_f16(output_ptr, tmp);
- },
- input, output);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
- static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
- const float32x4_t a = vdupq_n_f32(_act_info.a());
- const float32x4_t b = vdupq_n_f32(_act_info.b());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- const float32x4x4_t in =
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
{
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ switch(act)
{
- vld1q_f32(input_ptr),
- vld1q_f32(input_ptr + 4),
- vld1q_f32(input_ptr + 8),
- vld1q_f32(input_ptr + 12)
+ case ActivationFunction::ABS:
+ tmp = std::abs(in);
+ break;
+ case ActivationFunction::LINEAR:
+ tmp = a * in + b;
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+ break;
+ case ActivationFunction::RELU:
+ tmp = std::max<T>(static_cast<T>(0), in);
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max<T>(b, in));
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp = (in > 0) ? in : a * in;
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp = std::log(static_cast<T>(1) + std::exp(in));
+ break;
+ case ActivationFunction::SQRT:
+ tmp = std::sqrt(in);
+ break;
+ case ActivationFunction::SQUARE:
+ tmp = in * in;
+ break;
+ case ActivationFunction::TANH:
+ tmp = a * std::tanh(b * in);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
}
- };
- float32x4x4_t tmp = { {} };
-
- switch(F)
- {
- case ActivationFunction::ABS:
- tmp =
- {
- {
- vabsq_f32(in.val[0]),
- vabsq_f32(in.val[1]),
- vabsq_f32(in.val[2]),
- vabsq_f32(in.val[3]),
- }
- };
- break;
- case ActivationFunction::LINEAR:
- tmp =
- {
- {
- vmlaq_f32(b, a, in.val[0]),
- vmlaq_f32(b, a, in.val[1]),
- vmlaq_f32(b, a, in.val[2]),
- vmlaq_f32(b, a, in.val[3]),
- }
- };
- break;
- case ActivationFunction::LOGISTIC:
- tmp =
- {
- {
- vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))),
- vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))),
- vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))),
- vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))),
- }
- };
- break;
- case ActivationFunction::RELU:
- tmp =
- {
- {
- vmaxq_f32(CONST_0, in.val[0]),
- vmaxq_f32(CONST_0, in.val[1]),
- vmaxq_f32(CONST_0, in.val[2]),
- vmaxq_f32(CONST_0, in.val[3]),
- }
- };
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
- }
- };
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f32(a, vmaxq_f32(b, in.val[0])),
- vminq_f32(a, vmaxq_f32(b, in.val[1])),
- vminq_f32(a, vmaxq_f32(b, in.val[2])),
- vminq_f32(a, vmaxq_f32(b, in.val[3])),
- }
- };
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp =
- {
- {
- vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])),
- vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])),
- vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])),
- vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])),
- }
- };
- break;
- case ActivationFunction::SOFT_RELU:
- tmp =
- {
- {
- vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))),
- vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))),
- vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))),
- vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))),
- }
- };
- break;
- case ActivationFunction::SQRT:
- tmp =
- {
- {
- vinvq_f32(vinvsqrtq_f32(in.val[0])),
- vinvq_f32(vinvsqrtq_f32(in.val[1])),
- vinvq_f32(vinvsqrtq_f32(in.val[2])),
- vinvq_f32(vinvsqrtq_f32(in.val[3])),
- }
- };
- break;
- case ActivationFunction::SQUARE:
- tmp =
- {
- {
- vmulq_f32(in.val[0], in.val[0]),
- vmulq_f32(in.val[1], in.val[1]),
- vmulq_f32(in.val[2], in.val[2]),
- vmulq_f32(in.val[3], in.val[3]),
- }
- };
- break;
- case ActivationFunction::TANH:
- tmp =
- {
- {
- vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))),
- vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))),
- vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))),
- vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))),
- }
- };
- break;
- default:
- break;
+ *(output_ptr + x) = tmp;
}
-
- vst1q_f32(output_ptr, tmp.val[0]);
- vst1q_f32(output_ptr + 4, tmp.val[1]);
- vst1q_f32(output_ptr + 8, tmp.val[2]);
- vst1q_f32(output_ptr + 12, tmp.val[3]);
},
input, output);
}
@@ -514,13 +300,25 @@
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
- const QuantizationInfo qi_in = _input->info()->quantization_info();
- const QuantizationInfo qi_out = _output->info()->quantization_info();
- const qasymm8x16_t a = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
- const qasymm8x16_t b = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
- const qasymm8x16_t CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset));
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const ActivationFunction act = F;
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+
+ const QuantizationInfo qi_in = _input->info()->quantization_info();
+ const QuantizationInfo qi_out = _output->info()->quantization_info();
+ const qasymm8x16_t va = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
+ const qasymm8x16_t vb = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+ const qasymm8_t a = sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset);
+ const qasymm8_t b = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset);
+ const qasymm8_t const_0 = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset);
+ const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
// Initialise scale/offset for re-quantization
float s = qi_in.scale / qi_out.scale;
@@ -528,34 +326,116 @@
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates & id)
{
- const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- const qasymm8x16_t in = vld1q_u8(input_ptr);
- qasymm8x16_t tmp = {};
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
- switch(F)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
- case ActivationFunction::LU_BOUNDED_RELU:
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if(act == ActivationFunction::RELU)
+ {
// Perform activation
- tmp = vminq_u8(a, vmaxq_u8(b, in));
+ tmp = vmaxq_u8(vconst_0, vin);
// Re-quantize to new output space
tmp = vmlaq_qasymm8(tmp, vs, vo);
- break;
- case ActivationFunction::RELU:
+ }
+ else if(act == ActivationFunction::BOUNDED_RELU)
+ {
// Perform activation
- tmp = vmaxq_u8(CONST_0, in);
+ tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
// Re-quantize to new output space
tmp = vmlaq_qasymm8(tmp, vs, vo);
- break;
- default:
- ARM_COMPUTE_ERROR("Function not implemented");
- break;
+ }
+ else if(act == ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8(tmp, vs, vo);
+ }
+ else if(act == ActivationFunction::LOGISTIC)
+ {
+ const auto scale_in = vdupq_n_f32(qi_in.scale);
+ const auto off_in = vdupq_n_f32(qi_in.offset);
+ const auto scale_out = vdupq_n_f32(qi_out.scale);
+ const auto off_out = vdupq_n_f32(qi_out.offset);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+
+ const auto vin_low = wrapper::vgetlow(vin);
+ const auto vin_high = wrapper::vgethigh(vin);
+ uint16x8_t vin_low_u16x8 = wrapper::vmovl(vin_low);
+ uint16x8_t vin_high_u16x8 = wrapper::vmovl(vin_high);
+ // Convert uint16 vectors to uint32 vectors
+ uint32x4_t A_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_low_u16x8));
+ uint32x4_t B_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_low_u16x8));
+ uint32x4_t C_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_high_u16x8));
+ uint32x4_t D_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_high_u16x8));
+ // Convert uint32 vectors to float32 vectors
+ float32x4_t A_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(A_u32x4), off_in), scale_in);
+ float32x4_t B_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(B_u32x4), off_in), scale_in);
+ float32x4_t C_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(C_u32x4), off_in), scale_in);
+ float32x4_t D_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(D_u32x4), off_in), scale_in);
+ // Perform activation
+ A_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(A_f32x4))));
+ B_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(B_f32x4))));
+ C_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(C_f32x4))));
+ D_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(D_f32x4))));
+ // Convert float32 vectors to uint32 vectors
+ A_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(A_f32x4, scale_out), off_out));
+ B_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(B_f32x4, scale_out), off_out));
+ C_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(C_f32x4, scale_out), off_out));
+ D_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(D_f32x4, scale_out), off_out));
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ vin_low_u16x8 = wrapper::vcombine(wrapper::vqmovn(A_u32x4), wrapper::vqmovn(B_u32x4));
+ vin_high_u16x8 = wrapper::vcombine(wrapper::vqmovn(C_u32x4), wrapper::vqmovn(D_u32x4));
+ // Convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = wrapper::vcombine(wrapper::vqmovn(vin_low_u16x8), wrapper::vqmovn(vin_high_u16x8));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- vst1q_u8(output_ptr, tmp);
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ if(act == ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+ }
+ else if(act == ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+ }
+ else if(act == ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+ }
+ else if(act == ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = scvt_f32_qasymm8(in, qi_in.scale, qi_in.offset);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = sqcvt_qasymm8_f32(tmp_f, qi_out.scale, qi_out.offset);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
+ }
},
input, output);
}
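
The rewritten kernel drops the fixed 16-element access windows: the window is collapsed, X is stepped in vector-sized chunks, and the tail is handled scalar, so no padding is required. A reduced sketch of that loop structure using plain loops in place of the NEON wrappers (illustrative, not the library's iterator API):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Illustrative: the "vector body + scalar tail" structure used by the new kernel.
    // 'step' plays the role of window_step_x (16 / sizeof(T)); the inner loop stands
    // in for a single vector operation such as wrapper::vmax.
    void relu_row(const float *in, float *out, int width, int step)
    {
        int x = 0;
        // Body: process full vectors of 'step' elements.
        for(; x <= width - step; x += step)
        {
            for(int lane = 0; lane < step; ++lane)
            {
                out[x + lane] = std::max(0.f, in[x + lane]);
            }
        }
        // Tail: leftover elements handled one by one, so no right padding is needed.
        for(; x < width; ++x)
        {
            out[x] = std::max(0.f, in[x]);
        }
    }

    int main()
    {
        std::vector<float> in(19, -1.f), out(19, 0.f);
        in[5] = 3.f;
        relu_row(in.data(), out.data(), static_cast<int>(in.size()), 4);
        std::printf("%f %f\n", out[5], out[6]); // 3.0 0.0
        return 0;
    }
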
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 169554f..ffa578f 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -47,282 +48,426 @@
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T, bool is_sat>
+void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
+ ARM_COMPUTE_UNUSED(policy);
- execute_window_loop(window, [&](const Coordinates & id)
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ if(is_broadcast_across_x)
{
- vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
- },
- input1, input2, output);
-}
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- execute_window_loop(window, [&](const Coordinates & id)
- {
- vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
- },
- input1, input2, output);
-}
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
-inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
-{
- const int16x8x2_t res =
- {
+ execute_window_loop(win, [&](const Coordinates & id)
{
- vaddq_s16(a.val[0], b.val[0]),
- vaddq_s16(a.val[1], b.val[1])
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto res = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2;
+ }
+ },
+ input1, input2, output);
+ }
+}
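
add_same is templated on the element type and on whether the ConvertPolicy saturates, and it detects X-broadcasting from the input windows' step. A scalar sketch of the wrap/saturate distinction for uint8_t (illustrative; the vector body uses vadd/vqadd):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Illustrative: the two ConvertPolicy behaviours for a uint8_t addition.
    template <bool is_sat>
    uint8_t add_u8(uint8_t a, uint8_t b)
    {
        const int sum = static_cast<int>(a) + static_cast<int>(b);
        // SATURATE clamps to the representable range, WRAP keeps the low 8 bits.
        return is_sat ? static_cast<uint8_t>(std::min(sum, 255)) : static_cast<uint8_t>(sum);
    }

    int main()
    {
        std::printf("wrap: %u, saturate: %u\n", add_u8<false>(200, 100), add_u8<true>(200, 100)); // 44 and 255
        return 0;
    }
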
+
+void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(policy);
+
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ const float output_scale = out->info()->quantization_info().scale;
+ const int output_offset = out->info()->quantization_info().offset;
+
+ const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
+ const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
+ const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
+ const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
+ const float32x4_t voffseto = vdupq_n_f32(output_offset);
+
+ if(is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info();
+ const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+
+ const float32x4x4_t bf =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+ }
+ };
+ const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
+ const float32x4x4_t af =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+ }
+ };
+
+ const int32x4x4_t rf =
+ {
+ {
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+ }
+ };
+
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
+            *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+ const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(input1_ptr + x);
+ const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+ const float32x4x4_t af =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+ }
+ };
+
+ const float32x4x4_t bf =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+ }
+ };
+
+ const int32x4x4_t rf =
+ {
+ {
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+ }
+ };
+
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
+            *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
+ }
+ },
+ input1, input2, output);
+ }
+}
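
For reference, the QASYMM8 path above dequantizes both operands to float, adds them, and requantizes the sum with the output tensor's quantization info. A minimal scalar sketch of the same per-element arithmetic (illustrative only, not part of the patch; the scales/offsets are placeholders and std::lround stands in for the kernel's TO_NEAREST_UP / round-to-nearest-even rounding):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar equivalent of the vectorized QASYMM8 addition: dequantize, add, requantize, clamp.
    uint8_t add_qasymm8(uint8_t qa, float scale_a, int offset_a,
                        uint8_t qb, float scale_b, int offset_b,
                        float scale_out, int offset_out)
    {
        const float a = (static_cast<int32_t>(qa) - offset_a) * scale_a;                 // dequantize input 1
        const float b = (static_cast<int32_t>(qb) - offset_b) * scale_b;                 // dequantize input 2
        const int   q = static_cast<int>(std::lround((a + b) / scale_out)) + offset_out; // requantize the sum
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));                      // clamp to the U8 range
    }
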
+
+void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ if(policy == ConvertPolicy::WRAP)
+ {
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin1 = wrapper::vloadq(input1_ptr + x);
+ const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+ wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
+ }
}
- };
-
- return res;
-}
-
-inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- const float32x4x4_t res =
- {
+ else
{
- vaddq_f32(a.val[0], b.val[0]),
- vaddq_f32(a.val[1], b.val[1]),
- vaddq_f32(a.val[2], b.val[2]),
- vaddq_f32(a.val[3], b.val[3])
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin1 = wrapper::vloadq(input1_ptr + x);
+ const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+ wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
+ }
}
- };
-
- return res;
+ },
+ input1, input2, output);
}
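
The only functional difference between the two policies above is the inner add: ConvertPolicy::WRAP uses a plain addition (wrapper::vadd / +), while ConvertPolicy::SATURATE uses a saturating one (wrapper::vqadd / wrapper::add_sat). A small standalone comparison for int16_t with values chosen to overflow (illustrative, not part of the patch):

    #include <cstdint>
    #include <limits>

    int16_t add_wrap(int16_t a, int16_t b)
    {
        // Plain addition: 30000 + 10000 = 40000 wraps modulo 2^16 to -25536.
        return static_cast<int16_t>(a + b);
    }

    int16_t add_saturate(int16_t a, int16_t b)
    {
        // Saturating addition: 30000 + 10000 clamps to 32767 (INT16_MAX).
        const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
        const int32_t lo  = std::numeric_limits<int16_t>::min();
        const int32_t hi  = std::numeric_limits<int16_t>::max();
        return static_cast<int16_t>(sum < lo ? lo : (sum > hi ? hi : sum));
    }
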
-inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
{
- const int16x8x2_t res =
+ // Simply swap the two input buffers:
+ add_S16_U8_S16(input2, input1, output, policy, window);
+}
+
+void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(win, [&](const Coordinates & id)
{
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ if(policy == ConvertPolicy::WRAP)
{
- vqaddq_s16(a.val[0], b.val[0]),
- vqaddq_s16(a.val[1], b.val[1])
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+ const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+ wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
+ }
}
- };
-
- return res;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
-{
- const float16x8x2_t res =
- {
+ else
{
- vaddq_f16(a.val[0], b.val[0]),
- vaddq_f16(a.val[1], b.val[1])
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+ const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+ wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
+ static_cast<int16_t>(*(input2_ptr + x)));
+ }
}
- };
-
- return res;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
- const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
-
- vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
- },
- input1, input2, output);
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(in1);
- ARM_COMPUTE_UNUSED(in2);
- ARM_COMPUTE_UNUSED(out);
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-
-void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
- const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
-
- vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
- },
- input1, input2, output);
-}
-
-void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
- const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-
- vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
- },
- input1, input2, output);
-}
-
-void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
- const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-
- vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
- },
- input1, input2, output);
-}
-
-void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t a =
- {
- {
- vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
- vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
- }
- };
- const uint8x16_t b = vld1q_u8(input2.ptr());
-
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
- },
- input1, input2, output);
-}
-
-void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t a =
- {
- {
- vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
- vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
- }
- };
- const uint8x16_t b = vld1q_u8(input2.ptr());
-
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
- },
- input1, input2, output);
-}
-
-inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
-{
- //Simply swap the two input buffers:
- add_wrap_S16_U8_S16(input2, input1, output, window);
-}
-
-inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
-{
- //Simply swap the two input buffers:
- add_saturate_S16_U8_S16(input2, input1, output, window);
-}
-
-void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8x16_t a = vld1q_u8(input1.ptr());
- const uint8x16_t b = vld1q_u8(input2.ptr());
-
- const int16x8x2_t a_s16 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
- }
- };
-
- const int16x8x2_t b_s16 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
- }
- };
-
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
- },
- input1, input2, output);
-}
-
-void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8x16_t a = vld1q_u8(input1.ptr());
- const uint8x16_t b = vld1q_u8(input2.ptr());
-
- const int16x8x2_t a_s16 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
- }
- };
-
- const int16x8x2_t b_s16 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
- }
- };
-
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
- vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
},
input1, input2, output);
}
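
In the U8 + U8 -> S16 path both operands are widened to int16_t before the addition, so the largest possible sum (255 + 255 = 510) always fits and the WRAP and SATURATE variants produce identical results here. An illustrative scalar equivalent (not part of the patch):

    #include <cstdint>

    int16_t add_u8_u8_to_s16(uint8_t a, uint8_t b)
    {
        // Widen first: 255 + 255 = 510 is well inside the int16_t range,
        // so neither wrap-around nor saturation can ever occur.
        return static_cast<int16_t>(static_cast<int16_t>(a) + static_cast<int16_t>(b));
    }
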
@@ -332,12 +477,15 @@
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
+ || (input2.data_type() != output.data_type())),
+ "Broadcasting across width is supported on configurations where all tensors have the same data type");
// Validate in case of configured output
if(output.total_size() > 0)
@@ -349,7 +497,8 @@
&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
- && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
+ && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
+ && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8),
"You called addition with the wrong image formats");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -381,29 +530,26 @@
{
set_format_if_unknown(output, Format::F32);
}
+ else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
+ {
+ set_data_type_if_unknown(output, DataType::QASYMM8);
+ }
}
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(input1);
- Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+ Window win = calculate_max_window(valid_region, Steps());
- AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output.num_dimensions());
+ output.set_valid_region(valid_region);
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
}
} // namespace
NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
- : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
{
}
@@ -418,25 +564,30 @@
static std::map<std::string, AddFunction *> map_function =
{
- { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
- { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
- { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
- { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
- { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
- { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
- { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
- { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
- { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
- { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
- { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
- { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
- { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
- { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
+ { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+ { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+ { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
+ { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
+ { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
+ { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
+ { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
+ { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
+ { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
+ { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
+ { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
+ { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
+ { "add_wrap_F32_F32_F32", &add_same<float, false> },
+ { "add_saturate_F32_F32_F32", &add_same<float, false> },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
+ { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
};
_input1 = input1;
_input2 = input2;
_output = output;
+ _policy = policy;
std::string function_to_call("add_");
function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
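
The kernel selects its implementation by composing a key of the form add_<wrap|saturate>_<in1>_<in2>_<out> and looking it up in map_function above (the data-type suffix is appended in lines elided from this hunk). A self-contained sketch of that string-keyed dispatch, with a hard-coded key and a toy function standing in for the real AddFunction pointers (illustrative only):

    #include <iostream>
    #include <map>
    #include <string>

    int add_wrap(int a, int b) { return a + b; } // stand-in for e.g. add_same<uint8_t, false>

    int main()
    {
        const std::map<std::string, int (*)(int, int)> table = { { "add_wrap_U8_U8_U8", &add_wrap } };
        const std::string key = std::string("add_") + "wrap_" + "U8_U8_U8"; // composed like function_to_call
        std::cout << table.at(key)(2, 3) << '\n';                           // prints 5
        return 0;
    }
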
@@ -471,12 +622,5 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input1, _input2, _output, window);
-}
-
-BorderSize NEArithmeticAdditionKernel::border_size() const
-{
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ (*_func)(_input1, _input2, _output, _policy, window);
}
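
Dropping border_size() and the padding requirement is possible because every implementation now processes full vectors of window_step_x elements and finishes the remainder with a scalar tail, so it never touches memory beyond the tensor's width. A minimal self-contained sketch of that pattern, with plain scalars standing in for the NEON body (illustrative, not part of the patch):

    // Vectorized main loop plus scalar tail: no right-hand padding is required.
    void add_no_padding(const float *a, const float *b, float *out, int start, int end, int step)
    {
        int x = start;
        for(; x <= end - step; x += step)
        {
            for(int i = 0; i < step; ++i)   // stands in for one wrapper::vloadq / vadd / vstore
            {
                out[x + i] = a[x + i] + b[x + i];
            }
        }
        for(; x < end; ++x)                 // leftover elements are handled one by one
        {
            out[x] = a[x] + b[x];
        }
    }
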
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index c1e3e1f..ed83286 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -42,10 +42,10 @@
namespace
{
-template <typename T, int S>
+template <typename T>
inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, T *__restrict output)
{
- using type = typename wrapper::traits::neon_vector<T, S>::type;
+ using type = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::type;
const type val1 = vloadq(static_cast<const T *>(input1));
const type val2 = vloadq(static_cast<const T *>(input2));
@@ -108,7 +108,7 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- bitwise_and<uint8_t, 16>(input1.ptr(), input2.ptr(), output.ptr());
+ bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
},
input1, input2, output);
}
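
The templated bitwise_and above now derives its 128-bit vector type from wrapper::traits::neon_bitvector; for uint8_t this boils down to a 16-lane AND. A standalone sketch with raw NEON intrinsics, assuming a NEON-capable target (illustrative, not part of the patch):

    #include <arm_neon.h>
    #include <cstdint>

    // AND 16 bytes at a time, the same operation bitwise_and<uint8_t> performs per iteration.
    void bitwise_and_u8x16(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
    {
        const uint8x16_t a = vld1q_u8(in1);
        const uint8x16_t b = vld1q_u8(in2);
        vst1q_u8(out, vandq_u8(a, b));
    }
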
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index f8217d3..b2b0dbd 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,7 +38,7 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
1,
DataType::U8, DataType::S8, DataType::QASYMM8,
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index d6517ac..e3661ee 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 8c875cd..8352c94 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
@@ -57,14 +58,30 @@
Iterator input(in, window);
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ const DataType dt = in->info()->data_type();
+ const QuantizationInfo &input_qinfo = in->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = out->info()->quantization_info();
+ if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
{
- const auto in_ptr = reinterpret_cast<const T *>(input_ptr + input.offset());
- const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(input_ptr + input.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(output_ptr + output.offset());
+ vst1q_u8(out_ptr, vquantize(vdequantize(vld1q_u8(in_ptr), input_qinfo), output_qinfo));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_ptr + input.offset());
+ const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
- wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
- },
- input, output);
+ wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+ },
+ input, output);
+ }
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
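
When the concatenated QASYMM8 input and the output carry different quantization parameters, each byte is dequantized with the input's scale/offset and requantized with the output's (the vquantize(vdequantize(...)) call above). A scalar sketch of that rescaling, analogous to the earlier addition sketch and again using std::lround as a stand-in for the library's rounding (illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize(uint8_t q, float in_scale, int in_offset, float out_scale, int out_offset)
    {
        const float real = (static_cast<int32_t>(q) - in_offset) * in_scale;             // dequantize
        const int   outq = static_cast<int>(std::lround(real / out_scale)) + out_offset; // requantize
        return static_cast<uint8_t>(std::min(255, std::max(0, outq)));                   // clamp to U8
    }
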
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 158f401..5433755 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -43,10 +44,13 @@
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON(input == output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
+ "Only data_types supported [in] QASYMM8 -> [out] F16, F32");
+
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
&& output->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
@@ -57,11 +61,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
- "Only data_types supported [in] F16 -> [out] F32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F32),
+ "Only data_types supported [in] F16 -> [out] QASYMM8, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
- "Only data_types supported [in] F32 -> [out] F16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16),
+ "Only data_types supported [in] F32 -> [out] QASYMM8, F16");
// Validate in case of configured output
if(output->total_size() > 0)
@@ -134,6 +138,75 @@
switch(_input->info()->data_type())
{
+ case DataType::QASYMM8:
+ {
+ switch(_output->info()->data_type())
+ {
+ /* Up-conversion QASYMM8 -> F32 */
+ case DataType::F32:
+ {
+ const float32x4_t scale = vdupq_n_f32(_input->info()->quantization_info().scale);
+ const int32x4_t offset = vdupq_n_s32(_input->info()->quantization_info().offset);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+ const uint16x8x2_t texels_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(texels_u8)),
+ vmovl_u8(vget_high_u8(texels_u8))
+ }
+ };
+
+ const int32x4x4_t texels_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[0]))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[0]))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[1]))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[1])))
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[0], offset)), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[1], offset)), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[2], offset)), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[3], offset)), scale));
+ },
+ input, output);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ /* Up-conversion QASYMM8 -> F16 */
+ case DataType::F16:
+ {
+ const float16x8_t scale = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale));
+ const int16x8_t offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+ const int16x8x2_t texels_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
+ }
+ };
+
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[0], offset)), scale));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[1], offset)), scale));
+ },
+ input, output);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::U8:
{
const int16x8_t b = vdupq_n_s16(_shift);
@@ -367,6 +440,31 @@
case DataType::F16:
switch(_output->info()->data_type())
{
+ case DataType::QASYMM8:
+ {
+ const float16x8_t scale = vinvq_f16(vdupq_n_f16(static_cast<float16_t>(_output->info()->quantization_info().scale)));
+ const int16x8_t offset = vdupq_n_s16(static_cast<int16_t>(_output->info()->quantization_info().offset));
+ const int16x8_t max_val_vec = vdupq_n_s16(255);
+ const int16x8_t zero_val_vec = vdupq_n_s16(0);
+
+ /* Down-conversion F16 -> QASYMM8 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t texels =
+ {
+ {
+ vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())), scale),
+ vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8), scale),
+ }
+ };
+
+ const auto texel_quantized_0 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[0]), offset), max_val_vec), zero_val_vec);
+ const auto texel_quantized_1 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[1]), offset), max_val_vec), zero_val_vec);
+ vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(texel_quantized_0), vqmovun_s16(texel_quantized_1)));
+ },
+ input, output);
+ break;
+ }
case DataType::F32:
{
const float32x4_t scale = vdupq_n_f32(1 << _shift);
@@ -394,9 +492,44 @@
ARM_COMPUTE_ERROR("Output data type not supported");
}
break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
switch(_output->info()->data_type())
{
+ case DataType::QASYMM8:
+ {
+ const float32x4_t scale = vinvq_f32(vdupq_n_f32(_output->info()->quantization_info().scale));
+ const int32x4_t offset = vdupq_n_s32(_output->info()->quantization_info().offset);
+ const int32x4_t max_val_vec = vdupq_n_s32(255);
+ const int32x4_t zero_val_vec = vdupq_n_s32(0);
+
+ /* Down-conversion F32 -> QASYMM8 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t texels =
+ {
+ {
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
+ }
+ };
+
+ const auto texel_quantized_0 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[0]), offset), max_val_vec), zero_val_vec);
+ const auto texel_quantized_1 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[1]), offset), max_val_vec), zero_val_vec);
+ const auto texel_quantized_2 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[2]), offset), max_val_vec), zero_val_vec);
+ const auto texel_quantized_3 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[3]), offset), max_val_vec), zero_val_vec);
+
+ const auto converted_0 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_0), vqmovun_s32(texel_quantized_1)));
+ const auto converted_1 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_2), vqmovun_s32(texel_quantized_3)));
+
+ vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(converted_0, converted_1));
+ },
+ input, output);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
@@ -420,11 +553,11 @@
input, output);
break;
}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
ARM_COMPUTE_ERROR("Output data type not supported");
}
break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
ARM_COMPUTE_ERROR("Not supported");
}
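
The new QASYMM8 conversion cases above implement real = scale * (q - offset) on the way up and q = clamp(round(real / scale) + offset, 0, 255) on the way down. A quick self-contained numeric check of those formulas, with arbitrary example values for scale and offset (illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    int main()
    {
        const float scale  = 0.5f;
        const int   offset = 10;

        // Up-conversion (dequantize): q = 14  ->  (14 - 10) * 0.5 = 2.0
        const float real = (14 - offset) * scale;
        assert(real == 2.0f);

        // Down-conversion (quantize): 2.0  ->  clamp(round(2.0 / 0.5) + 10, 0, 255) = 14
        const int q = std::min(255, std::max(0, static_cast<int>(std::lround(real / scale)) + offset));
        assert(q == 14);
        return 0;
    }
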
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 99bdb7a..6071153 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -146,6 +147,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -347,6 +349,7 @@
void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
{
+ ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(_input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index e8fb8cd..62373e3 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,6 +47,7 @@
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
return Status{};
}
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 921582a..37269ca 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,6 +51,7 @@
TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 77ab5ad..b0e1fcb 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,6 +91,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index a571d54..09836f1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -431,21 +431,13 @@
uint8x16_t min = vdupq_n_u8(0);
uint8x16_t max = vdupq_n_u8(255);
- Window window_bias = window;
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
Iterator in(input, window);
- Iterator bi(bias, window_bias);
-
Iterator out(output, window);
execute_window_loop(window, [&](const Coordinates & id)
{
- // Get bias and pointer to input
+ // Get pointer to input
const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
- // Accumulate bias
int32x4x4_t v_in =
{
{
@@ -459,7 +451,7 @@
const auto out_ptr = out.ptr();
vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
},
- in, bi, out);
+ in, out);
}
} // namespace
@@ -498,6 +490,8 @@
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
+ const bool has_bias = bias != nullptr;
+
// Set appropriate function
if(input->info()->data_layout() == DataLayout::NCHW)
{
@@ -511,13 +505,27 @@
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
+ if(has_bias)
+ {
+ _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, false> : &output_stage_nchw<float16_t, float16_t, false, false>;
+ }
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
- _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
+ if(has_bias)
+ {
+ _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage_nchw<float, float, true, false> : &output_stage_nchw<float, float, false, false>;
+ }
break;
}
default:
@@ -532,19 +540,33 @@
{
case DataType::S32:
{
- _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+ _func = (bias == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
break;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+ if(has_bias)
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, false> : &output_stage_nhwc<float16_t, float16_t, false, false>;
+ }
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
- _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+ if(has_bias)
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, false> : &output_stage_nhwc<float, float, false, false>;
+ }
break;
}
default:
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
new file mode 100644
index 0000000..aa458c2
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -0,0 +1,930 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+{
+ qasymm8x16_t x = vld1q_u8(input1_ptr);
+ const float32x4x4_t out =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
+ }
+ };
+ return out;
+}
+
+void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
+{
+ const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
+ const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
+ vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
+{
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
+ vst1q_u8(output_ptr, vcombine_u8(pa, pb));
+}
+
+void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+{
+ int32x4x4_t out =
+ {
+ {
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }
+ };
+ store_quantized(output_ptr, out);
+}
+
+float32x4x4_t dup_quantized(qasymm8_t broadcast_value, int offset, float scale)
+{
+ const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+
+ const float32x4x4_t broadcast_vector =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset)), vscale),
+ }
+ };
+ return broadcast_vector;
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = std::max(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = std::min(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ res = (a - b) * (a - b);
+ break;
+ }
+ case ArithmeticOperation::DIV:
+ {
+ res = a / b;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <ArithmeticOperation op>
+inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, QuantizationInfo qinfo)
+{
+ return qinfo.quantize(elementwise_arithm_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
+template <ArithmeticOperation op, typename VectorType>
+inline VectorType elementwise_arithm_op(const VectorType &a, const VectorType &b)
+{
+ VectorType res = { 0, 0, 0, 0 };
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = wrapper::vmax(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = wrapper::vmin(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ const VectorType tmp = wrapper::vsub(a, b);
+ res = wrapper::vmul(tmp, tmp);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, float32x4_t>(const float32x4_t &a, const float32x4_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, float16x8_t>(const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <ArithmeticOperation op>
+inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ float32x4x4_t out =
+ {
+ {
+ elementwise_arithm_op<op>(a.val[0], b.val[0]),
+ elementwise_arithm_op<op>(a.val[1], b.val[1]),
+ elementwise_arithm_op<op>(a.val[2], b.val[2]),
+ elementwise_arithm_op<op>(a.val[3], b.val[3]),
+ }
+ };
+ return out;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_arithm_op_broadcast(const VectorType &a, const ScalarType &broadcast_value, const bool reorder)
+{
+ VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_arithm_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
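
The reorder flag above preserves operand order when one side is a broadcast scalar: with reorder == true the broadcast vector is passed as the first operand, otherwise as the second. This only matters for non-commutative operations such as DIV here (and for the comparison helpers below). A small standalone check of why the distinction is needed (illustrative, not part of the patch):

    #include <cassert>

    float div_op(float a, float b) { return a / b; }

    int main()
    {
        const float broadcast_value = 2.0f;
        const float x               = 8.0f;

        assert(div_op(x, broadcast_value) == 4.0f);  // reorder == false: broadcast value is the right operand
        assert(div_op(broadcast_value, x) == 0.25f); // reorder == true:  broadcast value is the left operand
        return 0;
    }
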
+
+template <ComparisonOperation op, typename InputScalarType>
+inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
+{
+ bool res = false;
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = (a == b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = (a != b);
+ break;
+ case ComparisonOperation::Greater:
+ res = (a > b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = (a >= b);
+ break;
+ case ComparisonOperation::Less:
+ res = (a < b);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = (a <= b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
+}
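
The scalar comparison helper above returns ~0 (0xFF) for true and 0 for false so that leftover elements match the all-ones / all-zeros per-lane masks produced by the NEON vector compares used in the main loops. A tiny illustration, assuming a NEON-capable target (not part of the patch):

    #include <arm_neon.h>
    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Vector compare: 5 > 3 in every lane, so every lane of the mask is 0xFF.
        const uint8x16_t mask = vcgtq_u8(vdupq_n_u8(5), vdupq_n_u8(3));
        assert(vgetq_lane_u8(mask, 0) == 0xFF);

        // Scalar equivalent used for the leftover elements.
        const uint8_t scalar_mask = (5 > 3) ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
        assert(scalar_mask == 0xFF);
        return 0;
    }
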
+
+template <ComparisonOperation op>
+inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, QuantizationInfo qinfo)
+{
+ ARM_COMPUTE_UNUSED(qinfo);
+ return elementwise_comp_op_scalar<op>(a, b);
+}
+
+template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
+{
+ OutputVectorType res = { 0, 0, 0, 0 };
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = wrapper::vceq(a, b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = wrapper::vnot(wrapper::vceq(a, b));
+ break;
+ case ComparisonOperation::Greater:
+ res = wrapper::vcgt(a, b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = wrapper::vcge(a, b);
+ break;
+ case ComparisonOperation::Less:
+ res = wrapper::vcgt(b, a);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = wrapper::vcge(b, a);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+
+template <ComparisonOperation op>
+inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ uint32x4x4_t out =
+ {
+ {
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
+ }
+ };
+ return out;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+{
+ InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op<op>(a, b));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
+ const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
+ int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
+ float32x4_t voffseto, float32x4_t invvscaleo)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get inputs and compute output
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op>
+inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+ const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto a = wrapper::vloadq(input1_ptr + x);
+ auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ a = wrapper::vloadq(input1_ptr + x + 4);
+ b = wrapper::vloadq(input2_ptr + x + 4);
+ const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(res, i);
+ }
+ x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
+ const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
+ int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
+ float32x4_t voffseto, float32x4_t invvscaleo)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
+ const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
+ const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(a, i);
+ }
+ x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op>
+inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+ const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+{
+ ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ store_quantized(output_ptr + x, rf);
+ }
+ return x;
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ if(is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
+ int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+ const float output_scale = out->info()->quantization_info().scale;
+ const int output_offset = out->info()->quantization_info().offset;
+
+ // Output quantization info (0.5 is added so that the later float-to-int conversion rounds to the nearest integer; exact halves round away from zero)
+ const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f);
+ const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
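+ // A sketch of the requantization these constants feed (assuming store_quantized multiplies by
+ // invvscaleo and adds voffseto before the float-to-integer conversion):
+ //   q = static_cast<uint8_t>(f * (1.f / output_scale) + output_offset + 0.5f)
+ // i.e. the float result is mapped back to the output quantization space and rounded to nearest.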
+
+ if(is_broadcast_across_x)
+ {
+ // Select the broadcast input on the X axis
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info();
+ const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+
+ const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
+ const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale);
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
+ voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
+ for(; x < window_end_x; ++x)
+ {
+ const float afs = scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, non_broadcast_qinfo.offset);
+ const float bfs = scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
+ out->info()->quantization_info());
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Input1 quantization info
+ const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
+ const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
+
+ // Input2 quantization info
+ const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
+ const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
+
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+ const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
+ vscale1, vscale2, voffseto, invvscaleo);
+ for(; x < window_end_x; ++x)
+ {
+ const float afs = scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset);
+ const float bfs = scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<ScalarType, ScalarType, VectorType>(in1, in2, out, window,
+ &elementwise_arithm_op_scalar<op, ScalarType>,
+ &elementwise_arithm_op_broadcast_loop<op, ScalarType, VectorType>,
+ &elementwise_arithm_op_loop<op, ScalarType, VectorType>);
+}
+
+template <ArithmeticOperation op>
+void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
+ &elementwise_arithm_op_quantized_broadcast_loop<op>,
+ &elementwise_arithm_op_quantized_loop<op>);
+}
+
+template <ComparisonOperation op>
+void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+ &elementwise_comp_op_quantized_broadcast_loop<op>,
+ &elementwise_comp_op_quantized_loop<op>);
+}
+
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_func(const ITensor *input1, const ITensor *input2, ITensor *output,
+ std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor * input1, const ITensor * input2, ITensor * output, const Window & window)
+ {
+ func(input1, input2, output, window);
+ };
+ }
+ return nullptr;
+}
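+
+// Dispatch is keyed by a string of the form "op_<in1 type>_<in2 type>_<out type>"; for example an
+// F32 arithmetic kernel resolves to "op_F32_F32_F32", while an F32 comparison kernel (U8 output)
+// resolves to "op_F32_F32_U8", matching the map entries in the configure_* helpers below.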
+
+template <ArithmeticOperation op>
+std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
+configure_arithm_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
+ {
+ { "op_F32_F32_F32", &elementwise_arithm_op<op, float, float32x4_t> },
+ { "op_S16_S16_S16", &elementwise_arithm_op<op, int16_t, int16x8_t> },
+ { "op_S32_S32_S32", &elementwise_arithm_op<op, int32_t, int32x4_t> },
+ { "op_QASYMM8_QASYMM8_QASYMM8", &elementwise_arithm_op_quantized<op> }
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16_F16_F16"] = &elementwise_arithm_op<op, float16_t, float16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ return configure_func(input1, input2, output, map_function);
+}
+
+template <ComparisonOperation op>
+std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)>
+configure_comp_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
+ {
+ { "op_F32_F32_U8", &elementwise_comp_op_32<op, float, float32x4_t> },
+ { "op_S16_S16_U8", &elementwise_comp_op_16<op, int16_t, int16x8_t> },
+ { "op_S32_S32_U8", &elementwise_comp_op_32<op, int32_t, int32x4_t> },
+ { "op_QASYMM8_QASYMM8_U8", &elementwise_comp_op_quantized<op> }
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16_F16_U8"] = &elementwise_comp_op_16<op, float16_t, float16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ return configure_func(input1, input2, output, map_function);
+}
+} // namespace
+
+NEElementwiseOperationKernel::NEElementwiseOperationKernel()
+ : _function(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+void NEElementwiseOperationKernel::configure_common(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input1->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ INEKernel::configure(win);
+}
+
+void NEElementwiseOperationKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_function == nullptr);
+ _function(_input1, _input2, _output, window);
+}
+
+/** Arithmetic operators (min, max, squared_diff) */
+
+void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ _function = configure_arithm_func<ArithmeticOperation::MAX>(input1, input2, output);
+ break;
+ case ArithmeticOperation::MIN:
+ _function = configure_arithm_func<ArithmeticOperation::MIN>(input1, input2, output);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ _function = configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(input1, input2, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+ }
+ return validate_arguments_common(input1, input2, output);
+}
+
+Status NEArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
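+
+// Minimal usage sketch (an illustration, not taken from this patch; `a`, `b` and `out` are assumed
+// to be allocated F32 tensors with broadcast-compatible shapes):
+//
+//   NEArithmeticOperationKernel kernel;
+//   kernel.configure(ArithmeticOperation::MAX, &a, &b, &out);
+//   NEScheduler::get().schedule(&kernel, Window::DimY);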
+
+/** The division operator */
+
+void NEDivisionOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ _function = configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output);
+}
+
+Status NEDivisionOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32);
+ return NEArithmeticOperationKernel::validate_arguments(input1, input2, output);
+}
+
+Status NEDivisionOperationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
+
+/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
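+// Note on output values: the comparison kernels write a U8 tensor in which, following the all-ones
+// lanes produced by the NEON vceq/vcgt/vcge intrinsics used above, a true result is expected to be
+// stored as 255 and a false result as 0.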
+
+void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+ configure_common(input1, input2, output);
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ _function = configure_comp_func<ComparisonOperation::Equal>(input1, input2, output);
+ break;
+ case ComparisonOperation::NotEqual:
+ _function = configure_comp_func<ComparisonOperation::NotEqual>(input1, input2, output);
+ break;
+ case ComparisonOperation::Greater:
+ _function = configure_comp_func<ComparisonOperation::Greater>(input1, input2, output);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ _function = configure_comp_func<ComparisonOperation::GreaterEqual>(input1, input2, output);
+ break;
+ case ComparisonOperation::Less:
+ _function = configure_comp_func<ComparisonOperation::Less>(input1, input2, output);
+ break;
+ case ComparisonOperation::LessEqual:
+ _function = configure_comp_func<ComparisonOperation::LessEqual>(input1, input2, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+Status NEComparisonOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
+ }
+ return validate_arguments_common(input1, input2, output);
+}
+
+Status NEComparisonOperationKernel::validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
+ return Status{};
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
new file mode 100644
index 0000000..7ecc4d1
--- /dev/null
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+template <ElementWiseUnary op, typename ScalarType>
+inline ScalarType elementwise_op_scalar(const ScalarType &a)
+{
+ switch(op)
+ {
+ case ElementWiseUnary::RSQRT:
+ return 1 / sqrt(a);
+ case ElementWiseUnary::EXP:
+ return std::exp(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnary op, typename VectorType>
+inline VectorType elementwise_op(const VectorType &a)
+{
+ switch(op)
+ {
+ case ElementWiseUnary::RSQRT:
+ return wrapper::vinvsqrt(a);
+ case ElementWiseUnary::EXP:
+ return wrapper::vexpq(a);
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
+
+template <ElementWiseUnary op, typename ScalarType>
+void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+{
+ const int window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for(; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x, elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+ }
+ for(; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+ }
+ },
+ input, output);
+}
+
+template <ElementWiseUnary op>
+std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+configure_func(const ITensor *input, ITensor *output)
+{
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ static std::map<std::string, NEElementwiseUnaryKernel::ElementwiseUnaryFunction *> map_function =
+ {
+ { "op_F32_F32", &elementwise_op<op, float> }
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ auto func = it->second;
+ return [func](const ITensor * input, ITensor * output, const Window & window)
+ {
+ func(input, output, window);
+ };
+ }
+ return nullptr;
+}
+} // namespace
+
+NEElementwiseUnaryKernel::NEElementwiseUnaryKernel()
+ : _function(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NEElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Configure kernel window
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input->info());
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
+
+ Window win = calculate_max_window(valid_region);
+
+ _input = input;
+ _output = output;
+
+ INEKernel::configure(win);
+
+ switch(op)
+ {
+ case ElementWiseUnary::RSQRT:
+ _function = configure_func<ElementWiseUnary::RSQRT>(input, output);
+ break;
+ case ElementWiseUnary::EXP:
+ _function = configure_func<ElementWiseUnary::EXP>(input, output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+}
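+
+// Minimal usage sketch (an assumption about typical call sites, not taken from this patch; `src`
+// and `dst` are assumed to be allocated F32 tensors of the same shape):
+//
+//   NEElementwiseUnaryKernel kernel;
+//   kernel.configure(ElementWiseUnary::RSQRT, &src, &dst);
+//   NEScheduler::get().schedule(&kernel, Window::DimY);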
+
+Status NEElementwiseUnaryKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
+
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ }
+
+ return Status{};
+}
+
+Status NEElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
+ return Status{};
+}
+
+void NEElementwiseUnaryKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_function == nullptr);
+ _function(_input, _output, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index aef4d48..f4046e0 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,15 +34,12 @@
#include <algorithm>
#include <cstdint>
-using namespace arm_compute;
-
+namespace arm_compute
+{
+class Coordinates;
namespace
{
-template <typename T, unsigned int leftx, unsigned int rightx>
-void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value);
-
-template <>
-inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
{
float border_value;
constant_border_value.get(border_value);
@@ -93,11 +90,6 @@
}
} // namespace
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
NEFillBorderKernel::NEFillBorderKernel()
: _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(static_cast<float>(0.f))
{
@@ -142,81 +134,19 @@
{
case BorderMode::CONSTANT:
{
- switch(_tensor->info()->data_type())
+ if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
{
- case DataType::QASYMM8:
- case DataType::U8:
- fill_constant_value_single_channel<uint8_t>(window);
- break;
- case DataType::S8:
- fill_constant_value_single_channel<int8_t>(window);
- break;
- case DataType::U16:
- fill_constant_value_single_channel<uint16_t>(window);
- break;
- case DataType::S16:
- fill_constant_value_single_channel<int16_t>(window);
- break;
- case DataType::U32:
- fill_constant_value_single_channel<uint32_t>(window);
- break;
- case DataType::S32:
- fill_constant_value_single_channel<int32_t>(window);
- break;
- case DataType::F16:
- static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
- fill_constant_value_single_channel<half>(window);
- break;
- case DataType::F32:
- static_assert(sizeof(float) == 4, "Float must be 32 bit");
- if(_border_size.left == 1 && _border_size.top == 1)
- {
- fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
- }
- else
- {
- fill_constant_value_single_channel<float>(window);
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not handled");
+ fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ }
+ else
+ {
+ fill_constant_value_single_channel(window);
}
break;
}
case BorderMode::REPLICATE:
{
- switch(_tensor->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::U8:
- fill_replicate_single_channel<uint8_t>(window);
- break;
- case DataType::S8:
- fill_replicate_single_channel<int8_t>(window);
- break;
- case DataType::U16:
- fill_replicate_single_channel<uint16_t>(window);
- break;
- case DataType::S16:
- fill_replicate_single_channel<int16_t>(window);
- break;
- case DataType::U32:
- fill_replicate_single_channel<uint32_t>(window);
- break;
- case DataType::S32:
- fill_replicate_single_channel<int32_t>(window);
- break;
- case DataType::F16:
- static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
- fill_replicate_single_channel<half>(window);
- break;
- case DataType::F32:
- static_assert(sizeof(float) == 4, "Float must be 32 bit");
- fill_replicate_single_channel<float>(window);
- break;
- default:
- ARM_COMPUTE_ERROR("Not handled");
- }
+ fill_replicate_single_channel(window);
break;
}
case BorderMode::UNDEFINED:
@@ -226,13 +156,12 @@
}
}
-template <typename T>
void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
{
uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
const size_t width = _tensor->info()->valid_region().shape[0];
const size_t height = _tensor->info()->valid_region().shape[1];
-
+ const size_t element_size = _tensor->info()->element_size();
// Left and right border
Window vertical(window);
vertical.set(Window::DimY, Window::Dimension(0, height, 1));
@@ -241,72 +170,18 @@
execute_window_loop(vertical, [&](const Coordinates & id)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
- const auto left_val = *reinterpret_cast<T *>(vertical_it.ptr());
- const auto right_val = *(reinterpret_cast<T *>(vertical_it.ptr()) + width - 1);
-
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
// Fill left and right borders
- std::fill_n(row_start - _border_size.left, _border_size.left, left_val);
- std::fill_n(row_start + width, _border_size.right, right_val);
- },
- vertical_it);
-
- // Top and bottom border
- Iterator plane_it(_tensor, window);
-
- // Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto first_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset());
-
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ for(unsigned int i = 0; i < _border_size.left; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
-
- // Copy top rows including left/right borders
- std::copy_n(first_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
}
- const auto last_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + (height - 1) * _tensor->info()->strides_in_bytes()[1]);
-
- // Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ for(unsigned int i = 0; i < _border_size.right; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
-
- // Copy bottom rows including left/right borders
- std::copy_n(last_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
}
},
- plane_it);
-}
-
-template <typename T>
-void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
-{
- T constant_border_value;
- _constant_border_value.get(constant_border_value);
-
- uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
- const size_t width = _tensor->info()->valid_region().shape[0];
- const size_t height = _tensor->info()->valid_region().shape[1];
- const int stridey = _tensor->info()->strides_in_bytes()[1];
-
- // Left and right border
- Window vertical(window);
- vertical.set(Window::DimY, Window::Dimension(0, height, 1));
-
- Iterator vertical_it(_tensor, vertical);
-
- execute_window_loop(vertical, [&](const Coordinates & id)
- {
- const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
-
- // Fill left and right borders
- std::fill_n(row_start - _border_size.left, _border_size.left, constant_border_value);
- std::fill_n(row_start + width, _border_size.right, constant_border_value);
- },
vertical_it);
// Top and bottom border
@@ -319,21 +194,80 @@
// Top border
for(int i = -_border_size.top; i < 0; ++i)
{
- const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
+ // Copy top rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
+ }
+ // Bottom border
+ for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ // Copy bottom rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
+ }
+ },
+ plane_it);
+}
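+
+// (The byte-wise std::memcpy approach above is what allows a single non-templated function to
+//  replace the former per-data-type template instantiations: element_size, taken from the tensor
+//  info, decides how many bytes each element copy moves, independently of the data type.)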
+
+void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
+{
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const size_t width = _tensor->info()->valid_region().shape[0];
+ const size_t height = _tensor->info()->valid_region().shape[1];
+ const int stridey = _tensor->info()->strides_in_bytes()[1];
+ const size_t element_size = _tensor->info()->element_size();
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for(unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
+ }
+
+ for(unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+ }
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(_tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for(int i = -_border_size.top; i < 0; ++i)
+ {
// Fill top rows including left/right borders
- std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ }
}
// Bottom border
const unsigned low_border_size = height + _border_size.bottom;
for(unsigned int i = height; i < low_border_size; ++i)
{
- const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
-
// Fill bottom rows including left/right borders
- std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ }
}
},
plane_it);
}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
index b8452fb..4840a95 100644
--- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,7 @@
DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
// Checks performed when output is configured
@@ -55,6 +56,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
new file mode 100644
index 0000000..e699bac
--- /dev/null
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "utils/TypePrinter.h"
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_UNUSED(epsilon);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(conv_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(conv_weights, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_mean, bn_var);
+
+ unsigned int kernels_idx = get_data_layout_dimension_index(conv_weights->data_layout(), DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_weights->dimension(kernels_idx) != bn_mean->dimension(0));
+
+ // Validate bias
+ if(conv_bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, conv_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, conv_bias);
+ }
+ // Validate beta
+ if(bn_beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_beta);
+ }
+ // Validate gamma
+ if(bn_gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_gamma);
+ }
+
+ // Validate output weights
+ if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(conv_weights, fused_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(conv_weights, fused_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_weights);
+ }
+ // Validate output bias
+ if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_bias);
+ }
+
+ return Status{};
+}
+
+template <typename ScalarType, int size>
+void fused_batch_normalization(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
+
+ const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+ const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+ // Set build options
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = size;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Iterator conv_w_in(conv_weights, win);
+ Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
+
+ const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+ auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{});
+ auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
+ const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{});
+
+ auto mean = ScalarType(0.0);
+ auto var = ScalarType(0.0);
+ auto gamma = ScalarType(1.0);
+ auto beta = ScalarType(0.0);
+ auto conv_bias_in_scalar = ScalarType(0.0);
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ if(slice != id[3])
+ {
+ slice = id[3];
+ mean = input_mean[slice];
+ var = input_var[slice];
+ gamma = ScalarType(1.0);
+ beta = ScalarType(0.0);
+
+ // Construct vectors
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ if(input_gamma != nullptr)
+ {
+ gamma = input_gamma[slice];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ }
+ if(input_beta != nullptr)
+ {
+ beta = input_beta[slice];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+ if(conv_bias_in != nullptr)
+ {
+ conv_bias_in_scalar = conv_bias_in[slice];
+ }
+ else
+ {
+ conv_bias_in_scalar = ScalarType(0);
+ }
+
+ conv_bias_in_scalar = (conv_bias_in_scalar - mean) / sqrt(var + ScalarType(epsilon));
+ conv_bias_in_scalar = (conv_bias_in_scalar * gamma) + beta;
+ conv_bias_out[slice] = conv_bias_in_scalar;
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ }
+
+ int x = window_start_x;
+ auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+ auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
+
+ // Store results
+ wrapper::vstore(conv_w_out_ptr + x, wn);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ conv_w_in, conv_w_out);
+}
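+
+// The loop above folds the batch normalization parameters into the convolution weights and bias;
+// for every output channel c it effectively computes:
+//   fused_weights[c] = conv_weights[c] * gamma[c] / sqrt(var[c] + epsilon)
+//   fused_bias[c]    = (conv_bias[c] - mean[c]) / sqrt(var[c] + epsilon) * gamma[c] + beta[c]
+// with gamma defaulting to 1, and beta and conv_bias defaulting to 0 when not provided.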
+} // namespace
+
+NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
+ : _conv_weights(nullptr), _conv_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
+ _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+{
+}
+
+void NEFuseBatchNormalizationKernel::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+ ITensor *fused_weights, ITensor *fused_bias,
+ const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(conv_weights, bn_mean, bn_var);
+
+ _conv_weights = conv_weights;
+ _conv_bias = conv_bias;
+ _bn_mean = bn_mean;
+ _bn_var = bn_var;
+ _bn_beta = bn_beta;
+ _bn_gamma = bn_gamma;
+ _fused_weights = fused_weights;
+ _fused_bias = fused_bias;
+ _epsilon = epsilon;
+
+ _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+ _run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+ // Auto initialize outputs
+ if(_fused_weights != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*_fused_weights->info(), *_conv_weights->info()->clone());
+ fused_weights->info()->set_valid_region(conv_weights->info()->valid_region());
+ }
+ if(_fused_bias != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
+ _fused_bias->info()->set_valid_region(bn_mean->info()->valid_region());
+ }
+
+ // Validate arguments
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(conv_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr,
+ (conv_bias != nullptr) ? conv_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr,
+ (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
+ epsilon));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*conv_weights->info());
+ INEKernel::configure(win);
+
+ // Configure function to run based on different data types
+ const DataType data_type = _conv_weights->info()->data_type();
+ switch(data_type)
+ {
+ case DataType::F32:
+ _func = &fused_batch_normalization<float, 4>;
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = &fused_batch_normalization<float16_t, 8>;
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Not Supported");
+ break;
+ }
+}
+
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon));
+ return Status{};
+}
+
+void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ (*_func)(_conv_weights, _conv_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 5483602..7769d9e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,7 @@
output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 024c4f8..f0ac695 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,37 +66,20 @@
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- // Note: This kernel performs 16 elements per iteration.
- // However, since we use a left-over for loop, we cannot have any read or write out of memory
- // For this reason num_elems_processed_per_iteration is set to 1
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     // Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps());
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ // NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- bool window_changed = update_window_and_padding(win,
- input_access);
-
- if(output->total_size() != 0)
- {
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
- }
-
- if(bias != nullptr)
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = window_changed || update_window_and_padding(win, bias_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(Status{}, win);
}
} // namespace
@@ -269,7 +252,7 @@
_max = max;
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+ auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
@@ -282,10 +265,7 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (bias != nullptr) ? bias->clone().get() : nullptr,
- output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
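For reference, a scalar model of the per-element requantization that NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel vectorizes, assuming the usual GEMMLowp fixed-point convention (rounding doubling high multiply by a Q0.31 multiplier, rounding right shift, offset, clamp); the helper and its parameter names are illustrative only.

    #include <algorithm>
    #include <cstdint>

    // int32 accumulator -> uint8, GEMMLowp-style fixed-point requantization (sketch).
    uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier, int32_t shift,
                                     int32_t offset, int32_t min_val, int32_t max_val)
    {
        const int32_t x = acc + bias;
        // Rounding doubling high multiply: (2 * x * multiplier + 2^31) >> 32
        const int64_t prod = static_cast<int64_t>(x) * multiplier;
        int32_t y = static_cast<int32_t>((prod + (1ll << 30)) >> 31);
        // Rounding right shift by 'shift'
        if(shift > 0)
        {
            y = (y + (1 << (shift - 1))) >> shift;
        }
        y += offset;
        y = std::max(min_val, std::min(max_val, y));
        return static_cast<uint8_t>(y);
    }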
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 2387869..cba3390 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -43,11 +44,12 @@
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,6 +89,48 @@
namespace arm_compute
{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out)
+{
+ Iterator in(_input0, window_in);
+ Iterator in2(_input1, window_w);
+ Iterator out(_output, window_out);
+
+ const int input_w = _input0->info()->dimension(0);
+ const int input_h = _input0->info()->dimension(1);
+ const int input_stride_x = _input0->info()->strides_in_bytes().x();
+ const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+ const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+
+ execute_window_loop(window_in, [&](const Coordinates & id)
+ {
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+ auto output_ptr = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+ float16x8_t row_dot = vdupq_n_f16(0.f);
+ for(int i = 0; i < input_w; i += 8)
+ {
+ const auto input = vld1q_f16(reinterpret_cast<const __fp16 *>(input_ptr + i * input_stride_x));
+ const auto weights = vld1q_f16(reinterpret_cast<const __fp16 *>(weights_ptr + i * weights_stride_x));
+ row_dot = vaddq_f16(row_dot, vmulq_f16(input, weights));
+ }
+
+ auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot));
+ temp = vpadd_f16(temp, temp);
+ temp = vpadd_f16(temp, temp);
+
+ *output_ptr = vget_lane_f16(temp, 0);
+ },
+ in, in2, out);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
template <>
void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in,
const Window &window_w,
@@ -226,6 +270,11 @@
case DataType::QASYMM8:
_func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>;
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
_func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>;
break;
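The new F16 specialization above computes one dot product per input row and reduces the float16x8_t accumulator pairwise with vpadd at the end. A scalar equivalent, illustrative only and assuming input_w is a multiple of 8 as the vector path does:

    #include <arm_neon.h>

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // Scalar model of the per-row dot product the vectorized F16 path performs.
    inline __fp16 row_dot_f16(const __fp16 *input_row, const __fp16 *weights_row, int input_w)
    {
        __fp16 acc = static_cast<__fp16>(0.f);
        for(int i = 0; i < input_w; ++i)
        {
            acc += input_row[i] * weights_row[i];
        }
        return acc;
    }
    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC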
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 2e14e7a..38503b7 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,6 +64,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
new file mode 100644
index 0000000..1e027b7
--- /dev/null
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in]  indices Indices tensor.
+ */
+template <typename U>
+void validate_indices(const ITensor *indices)
+{
+ for(size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+ }
+}
+
+} // namespace
+
+NEGatherKernel::NEGatherKernel()
+ : _input{}, _indices{}, _axis{}, _output{}, _func{}
+{
+}
+
+template <typename U>
+inline void NEGatherKernel::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Iterator output_it(_output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ Coordinates gather_id(id);
+
+ auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+ gather_id.set(0, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), output_it.ptr());
+ },
+ output_it);
+}
+
+template <typename U>
+void NEGatherKernel::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+
+ // Validate that the indices are not negative
+ validate_indices<U>(_indices);
+
+ Window output_window{ window };
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output_it(_output, output_window);
+ execute_window_loop(output_window, [&](const Coordinates & id)
+ {
+ Coordinates gather_id(id);
+
+ auto new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+ gather_id.set(_axis, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr());
+ },
+ output_it);
+}
+
+void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() != 1);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+
+ if(_axis < 0)
+ {
+ _axis += input->info()->num_dimensions();
+ }
+ ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+ if(0 == _axis)
+ {
+ switch(_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernel::gather_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernel::gather_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch(_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEGatherKernel::gather_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEGatherKernel::gather_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ if(axis < 0)
+ {
+ axis += input->num_dimensions();
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ return Status{};
+}
+
+void NEGatherKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
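The axis-0 path of NEGatherKernel selects whole elements along the innermost dimension: output[i, rest] = input[indices[i], rest]. A minimal flat-array sketch of that semantics, not using the library's tensor types (all names below are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Gather along dimension 0 (innermost). 'in_dim0' is the size of the input's
    // dimension 0, 'outer' the product of the remaining dimensions, 'elem' the
    // element size in bytes.
    void gather_axis0(const uint8_t *in, uint8_t *out, const uint32_t *indices,
                      std::size_t num_indices, std::size_t in_dim0, std::size_t outer, std::size_t elem)
    {
        for(std::size_t r = 0; r < outer; ++r)
        {
            for(std::size_t i = 0; i < num_indices; ++i)
            {
                std::memcpy(out + (r * num_indices + i) * elem,
                            in + (r * in_dim0 + indices[i]) * elem,
                            elem);
            }
        }
    }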
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 2c51eae..2e3d9de 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,6 +59,7 @@
TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index ed03783..cda041d 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -32,15 +32,20 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cmath>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
+template <typename T, int S>
void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
Window window_sum(window);
window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -53,30 +58,97 @@
Iterator sum_it(sum, sum_slice);
Iterator output_it(out, in_slice);
- const float sum_value = *reinterpret_cast<const float *>(sum_it.ptr());
- const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+ const auto sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
+ const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const float *>(input_it.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+ wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
},
input_it, output_it);
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
+template <typename T, int S>
+void l2_normalize_Y(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ Window window_sum(window);
+ window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_2D();
+ Window sum_slice = window_sum.first_slice_window_2D();
+
+ do
+ {
+ Iterator input_it(in, in_slice);
+ Iterator sum_it(sum, sum_slice);
+ Iterator output_it(out, in_slice);
+
+ auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+ wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+ },
+ input_it, sum_it, output_it);
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+}
+
+template <typename T, int S>
+void l2_normalize_Z(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ Window window_sum(window);
+ window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_3D();
+ Window sum_slice = window_sum.first_slice_window_3D();
+
+ do
+ {
+ Iterator input_it(in, in_slice);
+ Iterator sum_it(sum, sum_slice);
+ Iterator output_it(out, in_slice);
+
+ auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+ wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+ },
+ input_it, sum_it, output_it);
+ }
+ while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
// Reduce shape on axis
@@ -89,7 +161,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
}
return Status{};
@@ -158,9 +230,52 @@
switch(_axis)
{
case 0:
- l2_normalize_X(_input, _sum, _output, _epsilon, window);
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ case 1:
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ l2_normalize_Y<float, 4>(_input, _sum, _output, _epsilon, window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+                    l2_normalize_Y<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ case 2:
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ l2_normalize_Z<float, 4>(_input, _sum, _output, _epsilon, window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ l2_normalize_Z<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported normalization axis");
}
}
+} // namespace arm_compute
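All three axis variants above evaluate the same expression per element; a scalar reference of it (illustrative, assuming 'sum_of_squares' already holds the reduction result for the slice being normalized):

    #include <algorithm>
    #include <cmath>

    // out[i] = in[i] / sqrt(max(sum_of_squares, epsilon)) for the 'n' elements of one slice.
    void l2_normalize_slice(const float *in, float *out, int n, float sum_of_squares, float epsilon)
    {
        const float inv_norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
        for(int i = 0; i < n; ++i)
        {
            out[i] = in[i] * inv_norm;
        }
    }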
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
new file mode 100644
index 0000000..2b57b15
--- /dev/null
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+NEMemsetKernel::NEMemsetKernel()
+ : _tensor(nullptr), _constant_value()
+{
+}
+
+void NEMemsetKernel::configure(ITensor *tensor, const PixelValue &constant_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ _tensor = tensor;
+ _constant_value = constant_value;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*tensor->info(), Steps());
+ INEKernel::configure(win);
+}
+
+void NEMemsetKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Collapse all the batches on the third dimension
+ bool has_collapsed = true;
+ Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start());
+ const size_t element_size = _tensor->info()->element_size();
+
+ // Unroll X dimension
+ collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator tensor_it(_tensor, collapsed);
+ execute_window_loop(collapsed, [&](const Coordinates & id)
+ {
+ uint8_t *base_addr = start_valid_region + tensor_it.offset();
+ // Set memory
+ for(int i = 0; i < window_width; ++i)
+ {
+ std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+ }
+
+ },
+ tensor_it);
+}
+} // namespace arm_compute
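A possible usage sketch for NEMemsetKernel, assuming the Tensor/TensorInfo allocation pattern used elsewhere in the library and running the kernel directly on its full window for brevity (production code would normally dispatch it through the scheduler); this snippet is illustrative and not taken from the library's tests.

    #include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void memset_example()
    {
        // Small F32 tensor filled with the constant 1.0f
        Tensor t;
        t.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        t.allocator()->allocate();

        NEMemsetKernel memset_kernel;
        memset_kernel.configure(&t, PixelValue(1.f));
        memset_kernel.run(memset_kernel.window(), ThreadInfo{});
    }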
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 27af121..e5f6e4f 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -44,8 +45,6 @@
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D,
- "Only Cross-map and 1D In-map normalization is supported for NHWC layout");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
@@ -55,6 +54,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
}
return Status{};
@@ -143,16 +143,26 @@
{
if(norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+ _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+ _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, false>;
}
break;
}
+ case 1:
+ if(norm_info.type() == NormType::IN_MAP_2D)
+ {
+ _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
+ }
+ else
+ {
+ _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, false>;
+ }
+ break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+ _func = &NENormalizationLayerKernel::normalize_float<float, 4, 2, false>;
break;
default:
break;
@@ -168,16 +178,26 @@
{
if(norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+ _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+ _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, false>;
}
break;
}
+ case 1:
+ if(norm_info.type() == NormType::IN_MAP_2D)
+ {
+ _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
+ }
+ else
+ {
+ _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, false>;
+ }
+ break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+ _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 2, false>;
break;
default:
break;
@@ -195,14 +215,17 @@
INEKernel::configure(win_config.second);
}
-template <DataType dt, unsigned int dim, bool do_2D_norm>
+template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize_float(const Window &window)
{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
Iterator input(_input, window);
Iterator input_squared(_input_squared, window);
Iterator output(_output, window);
- const int dim_y = 1;
+ const int dim_y = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
const int radius = _norm_info.norm_size() / 2;
const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
// We account padding across X only and we iterate over rows
@@ -210,83 +233,39 @@
const int max_right = _input->info()->dimension(dim) - 1;
const int max_bottom = _input->info()->dimension(dim_y) - 1;
- if(dt == DataType::F32)
- {
- const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
- const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
- const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+ const auto coeff_vec = wrapper::vdup_n(static_cast<T>(_norm_info.scale_coeff()), ExactTagType{});
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
+ const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ for(int j = first_row; j <= last_row; j++)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- // Accumulate 2D In-Map values
- float32x4_t accu = vdupq_n_f32(0.f);
- for(int j = first_row; j <= last_row; j++)
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
- }
+ accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + i * input_squared_stride)));
}
+ }
- // Normalize
- const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
- const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(dt == DataType::F16)
- {
- const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
- const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
- const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- // Accumulate 2D In-Map values
- float16x8_t accu = vdupq_n_f16(0.f);
- for(int j = first_row; j <= last_row; j++)
- {
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
-
- const float16x8_t norm_f16 = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
- const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
+ // Normalize
+ const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+ const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(reinterpret_cast<const T *>(input.ptr())), wrapper::vinv(normalized));
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
}
Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
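For reference, the expression the templated normalize_float path computes per element, written as a scalar sketch (illustrative; 'sum' is the accumulated sum of squares over the normalization window):

    #include <cmath>

    // out = in / (kappa + coeff * sum)^beta, matching the vmla/vpow/vinv sequence above.
    float normalize_element(float in, float sum, float kappa, float coeff, float beta)
    {
        return in / std::pow(kappa + coeff * sum, beta);
    }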
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 29e6d50..1df94ae 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,6 +43,52 @@
namespace
{
+inline bool is_permutation_supported(const PermutationVector &v)
+{
+ static const std::array<PermutationVector, 6> permutations3 =
+ {
+ {
+ PermutationVector(2U, 0U, 1U),
+ PermutationVector(1U, 2U, 0U),
+ PermutationVector(0U, 1U, 2U),
+ PermutationVector(0U, 2U, 1U),
+ PermutationVector(1U, 0U, 2U),
+ PermutationVector(2U, 1U, 0U),
+ }
+ };
+ static const std::array<PermutationVector, 24> permutations4 =
+ {
+ {
+ PermutationVector(0U, 1U, 2U, 3U),
+ PermutationVector(1U, 0U, 2U, 3U),
+ PermutationVector(2U, 0U, 1U, 3U),
+ PermutationVector(0U, 2U, 1U, 3U),
+ PermutationVector(1U, 2U, 0U, 3U),
+ PermutationVector(2U, 1U, 0U, 3U),
+ PermutationVector(2U, 1U, 3U, 0U),
+ PermutationVector(1U, 2U, 3U, 0U),
+ PermutationVector(3U, 2U, 1U, 0U),
+ PermutationVector(2U, 3U, 1U, 0U),
+ PermutationVector(1U, 3U, 2U, 0U),
+ PermutationVector(3U, 1U, 2U, 0U),
+ PermutationVector(3U, 0U, 2U, 1U),
+ PermutationVector(0U, 3U, 2U, 1U),
+ PermutationVector(2U, 3U, 0U, 1U),
+ PermutationVector(3U, 2U, 0U, 1U),
+ PermutationVector(0U, 2U, 3U, 1U),
+ PermutationVector(2U, 0U, 3U, 1U),
+ PermutationVector(1U, 0U, 3U, 2U),
+ PermutationVector(0U, 1U, 3U, 2U),
+ PermutationVector(3U, 1U, 0U, 2U),
+ PermutationVector(1U, 3U, 0U, 2U),
+ PermutationVector(0U, 3U, 1U, 2U),
+ PermutationVector(3U, 0U, 1U, 2U)
+ }
+ };
+
+ return (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -50,9 +96,8 @@
DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
- && (perm != PermutationVector{ 1U, 2U, 0U }),
- "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported.");
const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
@@ -60,6 +105,7 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -70,12 +116,20 @@
template <typename T>
void NEPermuteKernel::run_permute(const Window &window)
{
+ const DataLayout input_layout = _input->info()->data_layout();
+
// Input window
Window window_in = window;
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
- window_in.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
- window_in.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
- window_in.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+
+    // Only these two configurations are handled by the optimized path in
+    // arm_compute/core/NEON/kernels/convolution/common/shims.hpp; all other permutations fall back to the generic C++ loop below.
+ if((input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U }) || (input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U }))
+ {
+ window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+ window_in.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+ window_in.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+ window_in.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+ }
// Output window
Window window_out(window);
@@ -89,23 +143,53 @@
Iterator in(_input, window_in);
Iterator out(_output, window_out);
- // CHW -> HWC
- if(_perm == PermutationVector{ 2U, 0U, 1U })
- {
- const int in_row_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
- const int in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
- const int in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
+ int in_row_stride = 0;
+ int in_col_stride = 0;
+ int in_channel_stride = 0;
+ int in_batch_stride = 0;
+ int n_cols = 0;
+ int n_rows = 0;
+ int n_channels = 0;
+ int n_batches = 0;
+ switch(input_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ in_row_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
+ in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
+ in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
+ n_cols = _input->info()->tensor_shape().x();
+ n_rows = window_in.y().step();
+ n_channels = _input->info()->tensor_shape().z();
+ n_batches = _input->info()->tensor_shape()[3];
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ in_col_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
+ in_row_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
+ in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
+ n_channels = _input->info()->tensor_shape().x();
+ n_cols = window_in.y().step();
+ n_rows = _input->info()->tensor_shape().z();
+ n_batches = _input->info()->tensor_shape()[3];
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid input data layout.");
+ break;
+ }
+ }
+
+ // CHW -> HWC
+ if(input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U })
+ {
const int out_channel_stride = _output->info()->strides_in_bytes().x() / sizeof(T);
const int out_col_stride = _output->info()->strides_in_bytes().y() / sizeof(T);
const int out_row_stride = _output->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = _output->info()->strides_in_bytes()[3] / sizeof(T);
-
- const int n_cols = _input->info()->tensor_shape().x();
- const int n_rows = window_in.y().step();
- const int n_channels = _input->info()->tensor_shape().z();
- const int n_batches = _input->info()->tensor_shape()[3];
-
execute_window_loop(window_in, [&](const Coordinates & id)
{
const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
@@ -117,22 +201,12 @@
in, out);
}
// HWC -> CHW
- else if(_perm == PermutationVector{ 1U, 2U, 0U })
+ else if(input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U })
{
- const int in_col_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
- const int in_row_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
- const int in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
-
const int out_col_stride = _output->info()->strides_in_bytes().x() / sizeof(T);
const int out_row_stride = _output->info()->strides_in_bytes().y() / sizeof(T);
const int out_channel_stride = _output->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = _output->info()->strides_in_bytes()[3] / sizeof(T);
-
- const int n_channels = _input->info()->tensor_shape().x();
- const int n_cols = window_in.y().step();
- const int n_rows = _input->info()->tensor_shape().z();
- const int n_batches = _input->info()->tensor_shape()[3];
-
execute_window_loop(window_in, [&](const Coordinates & id)
{
const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
@@ -145,7 +219,18 @@
}
else
{
- ARM_COMPUTE_ERROR("Unsupported permutation vector");
+ // All other cases fall back to C++
+ // Permute strides
+ Strides strides = _output->info()->strides_in_bytes();
+ Strides perm_strides = strides;
+ permute_strides(perm_strides, _perm);
+ const int perm_stride_3 = _input->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
}
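The generic C++ fallback added above addresses the output through permuted strides; a distilled standalone sketch of that indexing, with plain byte strides instead of the library's Strides type (names are illustrative):

    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy one element from 'in_ptr' to its permuted position in the output.
    // 'perm_strides' are the output byte strides reordered by the permutation
    // vector, so the output offset is the dot product of the input coordinates
    // with them.
    void permute_element(const uint8_t *in_ptr, uint8_t *out_base,
                         const std::array<int, 4> &id, const std::array<int, 4> &perm_strides,
                         std::size_t element_size)
    {
        const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
        std::memcpy(out_base + idx, in_ptr, element_size);
    }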
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 310560b..d00a4af 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,8 +52,7 @@
namespace
{
-template <bool exclude_padding, DataLayout data_layout>
-inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -72,8 +71,7 @@
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
-template <bool exclude_padding>
-inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
+inline void scale_vector_s16x8(bool exclude_padding, uint16x8_t &v, const Coordinates &id, int id_offset, int step,
const int pool_size, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
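A scalar sketch of the averaging scale the refactored calculate_avg_scale returns for the exclude-padding case: the pooling window is clamped to the valid input area and the scale is the reciprocal of the number of covered elements (parameter names are illustrative):

    #include <algorithm>

    float avg_pool_scale(int out_x, int out_y, int pool_w, int pool_h,
                         int upper_bound_w, int upper_bound_h,
                         int pad_x, int pad_y, int stride_x, int stride_y)
    {
        int start_x = out_x * stride_x - pad_x;
        int start_y = out_y * stride_y - pad_y;
        const int end_x = std::min(start_x + pool_w, upper_bound_w);
        const int end_y = std::min(start_y + pool_h, upper_bound_h);
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
        return 1.f / ((end_y - start_y) * (end_x - start_x));
    }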
@@ -140,6 +138,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
|| (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
}
@@ -336,13 +335,9 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const PoolingType pool_type = pool_info.pool_type();
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- const bool exclude_padding = pool_info.exclude_padding();
const bool is_global_pooling = pool_info.is_global_pooling();
const int pool_stride_x = pad_stride_info.stride().first;
- unsigned int pool_size_x = 0;
- unsigned int pool_size_y = 0;
// Get data layout
const DataLayout data_layout = input->info()->data_layout();
@@ -350,18 +345,19 @@
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Update pool size in case of global pooling
- pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
- pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
+ const Size2D pool_size(
+ is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width,
+ is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height);
// Validate pool info before calling scaled_dimensions
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
// Check output dimensions
unsigned int pooled_w, pooled_h;
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
input->info()->dimension(idx_height),
- pool_size_x,
- pool_size_y,
+ pool_size.x(),
+ pool_size.y(),
pad_stride_info);
// Perform validation step
@@ -371,7 +367,7 @@
_input = input;
_output = output;
_pool_info = pool_info;
- _is_square = (pool_size_x == pool_size_y);
+ _is_square = (pool_size.x() == pool_size.y());
// Get data type
const DataType data_type = input->info()->data_type();
@@ -379,88 +375,37 @@
if(data_type == DataType::QASYMM8)
{
- if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
+ if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
{
- switch(pool_type)
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::MAX>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
}
}
- else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square)
+ else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
{
- switch(pool_type)
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::MAX>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
}
}
else
{
- switch(pool_type)
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::MAX>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc;
}
}
}
@@ -468,157 +413,56 @@
{
if(_is_square)
{
- switch(pool_size_x)
+ switch(pool_size.x())
{
case 2:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
}
- break;
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+ }
+ }
+ break;
case 3:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
}
- break;
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
+ }
+ }
+ break;
default:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
}
break;
+ }
+ break;
}
}
else
{
- switch(pool_type)
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
}
}
}
@@ -626,206 +470,78 @@
{
if(_is_square)
{
- switch(pool_size_x)
+ switch(pool_size.x())
{
case 2:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
}
break;
+ }
case 3:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
}
break;
+ }
case 7:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
}
break;
+ }
default:
- switch(pool_type)
+ {
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
}
break;
+ }
}
}
else
{
- switch(pool_type)
+ if(is_nchw)
{
- case PoolingType::AVG:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
- }
- break;
- case PoolingType::L2:
- if(is_nchw)
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
- }
- else
- {
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
- }
- break;
- case PoolingType::MAX:
- if(is_nchw)
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
- }
- else
- {
- _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
+ }
+ else
+ {
+ _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
}
}
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -879,9 +595,9 @@
uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
// Scale lower result
- scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ scale_vector_s16x8(exclude_padding, res_lower, id, 0, scale_step_x,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
lower_res = vmovn_u16(res_lower);
// Compute upper result for stride_x == 1
@@ -907,9 +623,9 @@
uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
// Scale upper result
- scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ scale_vector_s16x8(exclude_padding, res_upper, id, 1, 2,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
upper_res = vmovn_u16(res_upper);
}
}
@@ -938,9 +654,10 @@
input, output);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
+ ARM_COMPUTE_UNUSED(pooling_type);
+ ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -978,7 +695,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
// Perform pooling
const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
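As a reading aid, here is a hedged scalar sketch of what the runtime calculate_avg_scale(exclude_padding, ...) call above is assumed to compute for NCHW: the reciprocal of the pooling-window area, with the window clamped to the padded upper bounds and, when exclude_padding is set, the start clamped to the tensor origin. Parameter names mirror the call sites but are illustrative, not the library signature.

#include <algorithm>

float avg_scale_sketch(bool exclude_padding, int out_x, int out_y,
                       int pool_size_x, int pool_size_y,
                       int upper_bound_w, int upper_bound_h,
                       int pad_left, int pad_top, int stride_x, int stride_y)
{
    // Window origin in input coordinates (may start inside the padded border)
    const int start_x = out_x * stride_x - pad_left;
    const int start_y = out_y * stride_y - pad_top;
    // Window end clamped to the (possibly padding-inclusive) upper bounds
    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    // When padding is excluded, elements before the tensor origin do not count either
    const int valid_x = end_x - (exclude_padding ? std::max(start_x, 0) : start_x);
    const int valid_y = end_y - (exclude_padding ? std::max(start_y, 0) : start_y);
    return 1.f / static_cast<float>(valid_x * valid_y);
}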
@@ -1008,9 +725,10 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
+ ARM_COMPUTE_UNUSED(pooling_type);
+ ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1042,7 +760,7 @@
if(pooling_type != PoolingType::MAX)
{
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
@@ -1071,71 +789,7 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- constexpr int pool_size = 2;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- top_data = vmul_f32(top_data, top_data);
- bottom_data = vmul_f32(bottom_data, bottom_data);
- }
-
- if(pooling_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
- res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
- }
- else
- {
- const float32x2_t max_data = vmax_f32(top_data, bottom_data);
- res = vpmax_f32(max_data, max_data);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
- },
- input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1212,21 +866,21 @@
vgetq_lane_u16(final_sum.val[1], 6),
};
- scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ scale_vector_s16x8(exclude_padding, res, id, 0, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
}
else
{
// Scale lower result
- scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ scale_vector_s16x8(exclude_padding, final_sum.val[0], id, 0, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Scale upper result
- scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
}
@@ -1254,160 +908,10 @@
input, output);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
- float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- top_data = vmulq_f32(top_data, top_data);
- middle_data = vmulq_f32(middle_data, middle_data);
- bottom_data = vmulq_f32(bottom_data, bottom_data);
- }
-
- if(pooling_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
- res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
- },
- input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- constexpr const int pool_size = 7;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
- std::array<const uint8_t *, pool_size> input_ptrs{ {} };
- for(int i = 0; i < pool_size; ++i)
- {
- input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
- }
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x2_t res = {};
- float final_res = 0.f;
- if(pooling_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
- // Get power of 2 in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
- for(int i = 1; i < pool_size; ++i)
- {
- data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
- // Get power of 2 in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- sum_data = vaddq_f32(sum_data, data.val[0]);
- sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
- }
- res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
- for(int i = 1; i < pool_size; ++i)
- {
- const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
- max_data = vmax2q_f32(max_data, data);
- }
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
- res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pooling_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(output.ptr())) = final_res;
- },
- input, output);
-}
-
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
-{
+ ARM_COMPUTE_UNUSED(pooling_type);
+ ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1432,7 +936,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
@@ -1528,9 +1032,10 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
+ ARM_COMPUTE_UNUSED(pooling_type);
+ ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1564,8 +1069,8 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
const float16x8_t scale_v = vdupq_n_f16(scale);
// Perform pooling
@@ -1625,8 +1130,7 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1650,7 +1154,7 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
float32x4_t vres = vdupq_n_f32(0.0f);
@@ -1748,8 +1252,218 @@
input, output);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 7;
+ const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
+ const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
+ const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+ const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+ std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+ for(int i = 0; i < pool_size; ++i)
+ {
+ input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+ }
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t res = {};
+ float final_res = 0.f;
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ // Get power of 2 in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ max_data = vmax2q_f32(max_data, data);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1783,8 +1497,8 @@
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
const float32x4_t scale_v = vdupq_n_f32(scale);
// Perform pooling
@@ -1837,8 +1551,7 @@
input, output);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1865,7 +1578,7 @@
uint32_t sres = 0;
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
for(int y = 0; y < pool_size_y; ++y)
@@ -1933,8 +1646,7 @@
input, output);
}
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -1973,8 +1685,8 @@
uint32x4_t vres4 = vdupq_n_u32(0);
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
const float32x4_t scale_v = vdupq_n_f32(scale);
// Perform pooling
@@ -2073,9 +1785,10 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
- const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
- const unsigned int pool_size = _pool_info.pool_size().width;
+ const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
+ const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
+ const unsigned int pool_size = _pool_info.pool_size().width;
+ const bool exclude_padding = _pool_info.exclude_padding();
Window window_input(window);
if(_input->info()->data_layout() == DataLayout::NCHW)
@@ -2093,6 +1806,7 @@
}
break;
}
+
case DataType::F16:
case DataType::F32:
{
@@ -2115,5 +1829,5 @@
}
// Run function
- (this->*_func)(window_input, window);
+ (this->*_func)(window_input, window, _pool_info.pool_type(), exclude_padding);
}
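For reference, a minimal standalone sketch (illustrative names, not the library's actual classes) of the dispatch pattern introduced above: configure() now picks one member function per data layout, and run() forwards the pooling type and the exclude_padding flag at run time instead of selecting among per-type template instantiations.

enum class PoolingType { MAX, AVG, L2 };

class PoolingKernelSketch
{
public:
    void configure(bool is_nchw)
    {
        // One function per data layout; pooling type is now a runtime argument
        _func = is_nchw ? &PoolingKernelSketch::pool_nchw : &PoolingKernelSketch::pool_nhwc;
    }
    void run(PoolingType type, bool exclude_padding)
    {
        // Forward the runtime parameters, mirroring (this->*_func)(..., pool_type, exclude_padding) above
        (this->*_func)(type, exclude_padding);
    }

private:
    void pool_nchw(PoolingType, bool) { /* NCHW path */ }
    void pool_nhwc(PoolingType, bool) { /* NHWC path */ }

    using PoolingFunction = void (PoolingKernelSketch::*)(PoolingType, bool);
    PoolingFunction _func = nullptr;
};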
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 2f63179..365fc83 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -67,8 +67,7 @@
if(output != nullptr && output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
return Status{};
@@ -76,29 +75,13 @@
std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
{
- ARM_COMPUTE_UNUSED(input2);
+ ARM_COMPUTE_UNUSED(input1, input2);
- Window win = {};
- bool window_changed = false;
- switch(input1->data_layout())
- {
- case DataLayout::NCHW:
- {
- const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, output_access);
- break;
- }
- case DataLayout::NHWC:
- {
- win = calculate_max_window(*output, Steps());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- };
+ const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+ const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, output_access);
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
@@ -106,11 +89,10 @@
} // namespace
NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
- : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
{
}
-template <DataLayout DL>
void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
const int height)
{
@@ -119,49 +101,23 @@
float xmax = (center_x + box_width / 2.f) / width;
float ymax = (center_y + box_height / 2.f) / height;
- switch(DL)
+ float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+ if(_info.clip())
{
- case DataLayout::NCHW:
- {
- float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
- if(_info.clip())
- {
- static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
- vec_elements = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
- }
- vst1q_f32(out + offset, vec_elements);
- }
- break;
- case DataLayout::NHWC:
- {
- const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
- if(_info.clip())
- {
- xmin = std::min(std::max(xmin, 0.f), 1.f);
- ymin = std::min(std::max(ymin, 0.f), 1.f);
- xmax = std::min(std::max(xmax, 0.f), 1.f);
- ymax = std::min(std::max(ymax, 0.f), 1.f);
- }
-
- *(out + output_offset * offset) = xmin;
- *(out + output_offset * (offset + 1)) = ymin;
- *(out + output_offset * (offset + 2)) = xmax;
- *(out + output_offset * (offset + 3)) = ymax;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
+ static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ vec_elements = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
}
+ vst1q_f32(out + offset, vec_elements);
}
-template <DataLayout DL>
void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
{
const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
- const int width_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout = _input1->info()->data_layout();
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int layer_width = _input1->info()->dimension(width_idx);
const int layer_height = _input1->info()->dimension(height_idx);
@@ -182,44 +138,17 @@
step_y = static_cast<float>(img_height) / layer_height;
}
- Window slice = {};
-
- switch(DL)
- {
- case DataLayout::NCHW:
- slice = window.first_slice_window_2D();
- slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
- break;
- case DataLayout::NHWC:
- slice = window.first_slice_window_3D();
- slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
- slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
+ Window slice = window.first_slice_window_2D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
Iterator output(_output, slice);
execute_window_loop(slice, [&](const Coordinates & id)
{
float center_x = 0;
float center_y = 0;
- int idx = 0;
- switch(DL)
- {
- case DataLayout::NCHW:
- idx = id.x() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
- break;
- case DataLayout::NHWC:
- idx = id.y() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
+ int idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
float box_width;
float box_height;
@@ -231,7 +160,7 @@
const float min_size = _info.min_sizes().at(i);
box_width = min_size;
box_height = min_size;
- store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
if(!_info.max_sizes().empty())
@@ -240,7 +169,7 @@
box_width = std::sqrt(min_size * max_size);
box_height = box_width;
- store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
}
@@ -255,50 +184,27 @@
box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar);
- store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
}
}
// set the variance
- switch(DL)
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if(_info.variances().size() == 1)
{
- case DataLayout::NCHW:
- {
- out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
- float32x4_t var;
- if(_info.variances().size() == 1)
- {
- var = vdupq_n_f32(_info.variances().at(0));
- }
- else
- {
- const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
- var = vars;
- }
- for(int i = 0; i < num_priors; ++i)
- {
- vst1q_f32(out + 4 * i, var);
- }
- }
- break;
- case DataLayout::NHWC:
- {
- for(int i = 0; i < num_priors; ++i)
- {
- const int prior_offset = 4 * i;
- const bool single_var = _info.variances().size() == 1;
- *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
- *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
- *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
- *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
- }
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
+ var = vdupq_n_f32(_info.variances().at(0));
}
-
+ else
+ {
+ const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+ var = vars;
+ }
+ for(int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
},
output);
}
@@ -314,22 +220,6 @@
_info = info;
_output = output;
- switch(input1->info()->data_layout())
- {
- case DataLayout::NCHW:
- {
- _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
- break;
- }
- case DataLayout::NHWC:
- {
- _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not implemented.");
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -350,9 +240,8 @@
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
// Run function
- (this->*_func)(window);
+ calculate_prior_boxes(window);
}
} // namespace arm_compute
\ No newline at end of file
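For illustration, a scalar sketch of the corner computation performed by store_coordinates() above, without the NEON vector path; the struct and function names are assumptions, not library API.

#include <algorithm>

struct Box { float xmin, ymin, xmax, ymax; };

Box make_prior_box(float center_x, float center_y, float box_width, float box_height,
                   int img_width, int img_height, bool clip)
{
    // Corners normalized by the image dimensions
    Box b{ (center_x - box_width / 2.f) / img_width,
           (center_y - box_height / 2.f) / img_height,
           (center_x + box_width / 2.f) / img_width,
           (center_y + box_height / 2.f) / img_height };
    if(clip)
    {
        // Clamp all four coordinates to the normalized range [0, 1]
        b.xmin = std::min(std::max(b.xmin, 0.f), 1.f);
        b.ymin = std::min(std::max(b.ymin, 0.f), 1.f);
        b.xmax = std::min(std::max(b.xmax, 0.f), 1.f);
        b.ymax = std::min(std::max(b.ymax, 0.f), 1.f);
    }
    return b;
}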
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 4d908db..b8d20f6 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -35,22 +36,36 @@
#include <cfloat>
#include <cmath>
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
: _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
{
}
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
- // Output auto inizialitation if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+ // Validate arguments
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), rois->info(), output->info());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
+ ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
+ ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+ if(output->info()->total_size() != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(rois->info()->dimension(1) != output->info()->dimension(3));
+ }
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -64,7 +79,7 @@
// Configure kernel window
Window window;
- window.set(Window::DimX, Window::Dimension(0, rois->num_values()));
+ window.set(Window::DimX, Window::Dimension(0, rois->info()->dimension(1)));
window.set(Window::DimY, Window::Dimension(0, 1));
AccessWindowStatic input_access(input->info(),
@@ -85,6 +100,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ const size_t values_per_roi = _rois->info()->dimension(0);
+
const int roi_list_start = window.x().start();
const int roi_list_end = window.x().end();
const int width = _input->info()->dimension(Window::DimX);
@@ -94,16 +111,21 @@
const int pooled_h = _pool_info.pooled_height();
const float spatial_scale = _pool_info.spatial_scale();
+ const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
+
for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
- const ROI &curr_roi = _rois->at(roi_indx);
+ const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
+ const auto x1 = rois_ptr[values_per_roi * roi_indx + 1];
+ const auto y1 = rois_ptr[values_per_roi * roi_indx + 2];
+ const auto x2 = rois_ptr[values_per_roi * roi_indx + 3];
+ const auto y2 = rois_ptr[values_per_roi * roi_indx + 4];
// Scale ROI
- const int roi_batch = curr_roi.batch_idx;
- const int roi_anchor_x = support::cpp11::round(curr_roi.rect.x * spatial_scale);
- const int roi_anchor_y = support::cpp11::round(curr_roi.rect.y * spatial_scale);
- const int roi_width = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
- const int roi_height = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
+ const int roi_anchor_x = support::cpp11::round(x1 * spatial_scale);
+ const int roi_anchor_y = support::cpp11::round(y1 * spatial_scale);
+ const int roi_width = std::max(support::cpp11::round((x2 - x1) * spatial_scale), 1.f);
+ const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
// Iterate through all feature maps
for(int fm = 0; fm < fms; ++fm)
@@ -146,3 +168,4 @@
}
}
}
+} // namespace arm_compute
\ No newline at end of file
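A hedged sketch of how a single ROI is assumed to be decoded from the new U16 tensor layout used above (five values per ROI: batch index, x1, y1, x2, y2), applying the same rounding, spatial scaling and minimum-size clamping as the kernel; the struct and helper names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct ScaledROI { int batch, anchor_x, anchor_y, width, height; };

ScaledROI decode_roi(const uint16_t *rois_ptr, size_t values_per_roi, int roi_index, float spatial_scale)
{
    const uint16_t *roi = rois_ptr + values_per_roi * roi_index;
    const float x1 = roi[1], y1 = roi[2], x2 = roi[3], y2 = roi[4];
    return ScaledROI{ static_cast<int>(roi[0]),
                      static_cast<int>(std::round(x1 * spatial_scale)),
                      static_cast<int>(std::round(y1 * spatial_scale)),
                      static_cast<int>(std::max(std::round((x2 - x1) * spatial_scale), 1.f)),   // at least 1 pixel wide
                      static_cast<int>(std::max(std::round((y2 - y1) * spatial_scale), 1.f)) }; // at least 1 pixel tall
}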
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
new file mode 100644
index 0000000..189e77f
--- /dev/null
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+void range_function(ITensor *output, float start, float step, const Window &window)
+{
+ const unsigned int num_elems_processed_per_iteration = 16 / sizeof(T);
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
+
+ const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
+ const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
+ auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ Iterator output_it(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ for(unsigned int count = 0; count < num_elems_processed_per_iteration; ++count)
+ {
+ id_vec = wrapper::vsetlane(static_cast<T>(id.x() + count), id_vec, count);
+ }
+ // start + step * id
+ const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+ wrapper::vstore(out_ptr, res_vec);
+ },
+ output_it);
+}
+
+Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output,
+ 1,
+ DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &output, const float start, const float end, const float step)
+{
+ const unsigned int num_elems_processed_per_iteration = 16 / output.element_size();
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(output, TensorShape(num_of_elements_in_range(start, end, step)), 1, output.data_type(), output.quantization_info());
+
+ // Configure kernel window
+ Window win = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), TensorShape(num_of_elements_in_range(start, end, step))));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NERangeKernel::NERangeKernel()
+ : _func(nullptr), _start(0), _end(1), _step(1), _output(nullptr)
+{
+}
+
+void NERangeKernel::configure(ITensor *output, float start, float end, float step)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*(output->info()), start, end, step);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _start = start;
+ _end = end;
+ _step = step;
+ _output = output;
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ _func = &range_function<uint8_t>;
+ break;
+ case DataType::U16:
+ _func = &range_function<uint16_t>;
+ break;
+ case DataType::U32:
+ _func = &range_function<uint32_t>;
+ break;
+ case DataType::S8:
+ _func = &range_function<int8_t>;
+ break;
+ case DataType::S16:
+ _func = &range_function<int16_t>;
+ break;
+ case DataType::S32:
+ _func = &range_function<int32_t>;
+ break;
+ case DataType::F32:
+ _func = &range_function<float>;
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = &range_function<float16_t>;
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ break;
+ }
+
+ INEKernel::configure(win_config.second);
+}
+
+Status NERangeKernel::validate(const ITensorInfo *output, float start, float end, float step)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*output, start, end, step));
+ ARM_COMPUTE_RETURN_ON_ERROR((validate_and_configure_window(*(output->clone()), start, end, step)).first);
+
+ return Status{};
+}
+
+void NERangeKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_output, _start, _step, window);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 182e93d..84cb223 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
@@ -31,6 +32,7 @@
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -39,11 +41,232 @@
{
namespace
{
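+// Per-lane helper for ARG_IDX_MIN/ARG_IDX_MAX: lanes where the new running value 'a' beats the
+// previous value 'b' take the current index 'idx'; the remaining lanes keep the indices already
+// stored in 'c'. For reductions along axis != 0 the same index is broadcast to every lane.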
+uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4_t mask{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask = wrapper::vclt(b, a);
+ }
+
+ uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
+ if(axis != 0)
+ {
+ vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+
+ return res;
+}
+
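+// QASYMM8 variant: the 16-lane u8 comparison mask is widened to four u32 masks so that a 32-bit
+// index can be selected for each of the 16 elements.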
+uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x4_t mask{ { 0 } };
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u8 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u8 = wrapper::vclt(b, a);
+ }
+ auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+ uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 },
+ { idx + 8, idx + 9, idx + 10, idx + 11 },
+ { idx + 12, idx + 13, idx + 14, idx + 15 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res =
+ {
+ {
+ vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
+ vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
+ vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
+ }
+ };
+
+ return res;
+}
+
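+// Reduces the per-lane index vector to a scalar: lanes holding the overall min/max value are
+// isolated with an equality mask and the smallest matching index is returned.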
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+{
+ uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+
+ res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+ pmin = wrapper::vpmin(pmin, pmin);
+ uint32_t res = wrapper::vgetlane(pmin, 0);
+
+ return (res - 0xFFFFFFFF);
+}
+
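+// QASYMM8 variant of the scalar index extraction: the 16-lane equality mask is widened to four
+// u32 masks before the smallest matching index is selected.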
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x4_t res_idx_mask{ { 0 } };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+ res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+ res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+ res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 4);
+
+ return (res - 0xFFFFFFFF);
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
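+// FP16 variants: eight half-precision lanes per vector, so only two u32 index vectors are needed.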
+uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x2_t mask{ 0 };
+ uint16x8_t mask_u16{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u16 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u16 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+ uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
+ 0, 0
+ };
+
+ return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint16x8_t mask_u16;
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 2);
+
+ return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
template <class F>
class Reducer
{
public:
- static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set out window
Window out_window(window);
@@ -58,51 +281,55 @@
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info());
+ f(in, out, in_slice, out_slice, *input->info(), op);
}
while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
- static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
+ Window out_window(window);
in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
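+        // Step over the full output Y extent in one go so that each 2D input slice maps onto a single output slice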
// Get first input and output slices
Window in_slice = in_window.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
do
{
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 1);
+ f(in, out, in_slice, out_slice, *input->info(), 1, op);
}
- while(in_window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
- static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
+ Window out_window(window);
in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
// Get first input and output slices
Window in_slice = in_window.first_slice_window_3D();
- Window out_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_3D();
do
{
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 2);
+ f(in, out, in_slice, out_slice, *input->info(), 2, op);
}
- while(in_window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
}
- static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in/out window
Window in_window(window);
@@ -120,115 +347,278 @@
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 3);
+ f(in, out, in_slice, out_slice, *input->info(), 3, op);
}
while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpX
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
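+        // Seed the accumulator according to the operation: first input element for ARG_IDX_MIN/MAX,
+        // 1 for PROD and 0 for the sum-based reductions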
+ auto init_res_value = static_cast<T>(0.f);
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ init_res_value = *reinterpret_cast<T *>(input.ptr());
+ }
+ else if(op == ReductionOperation::PROD)
+ {
+ init_res_value = static_cast<T>(1.f);
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{ { 0 } };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
+ switch(op)
{
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
- {
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
},
input);
- auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
- for(int i = 0; i < S / 4; ++i)
+ switch(op)
{
- carry_addition = wrapper::vpadd(carry_addition, carry_addition);
- }
+ case ReductionOperation::SUM:
+ case ReductionOperation::SUM_SQUARE:
+ case ReductionOperation::MEAN_SUM:
+ {
+ auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for(int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
- auto res = wrapper::vgetlane(carry_addition, 0);
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= in_info.dimension(0);
- }
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ T res = 1;
+ for(int i = 0; i < S / 2; ++i)
+ {
+ res *= wrapper::vgetlane(carry_res, i);
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
};
-template <ReductionOperation op>
struct RedOpX_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
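+        // u32 accumulators serve the sum-based reductions; the f32 accumulators hold the
+        // de-quantized running product for PROD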
+ auto vec_res_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+ uint8x16_t vec_res_value = { 0 };
+
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+ }
+
+ uint32x4x4_t vec_res_idx{ { 0 } };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto vec_elements = wrapper::vloadq(input.ptr());
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+ const auto scale32x4f_4 = vdupq_n_f32(in_info.quantization_info().scale);
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ const auto temp16x8t_1 = vmovl_u8(vget_low_u8(vec_elements));
+ const auto temp16x8t_2 = vmovl_u8(vget_high_u8(vec_elements));
+
+ const auto temp32x4t_1 = vmovl_u16(vget_low_u16(temp16x8t_1));
+ const auto temp32x4t_2 = vmovl_u16(vget_high_u16(temp16x8t_1));
+ const auto temp32x4t_3 = vmovl_u16(vget_low_u16(temp16x8t_2));
+ const auto temp32x4t_4 = vmovl_u16(vget_high_u16(temp16x8t_2));
+
+ auto temp32x4f_1 = vcvtq_f32_u32(temp32x4t_1);
+ auto temp32x4f_2 = vcvtq_f32_u32(temp32x4t_2);
+ auto temp32x4f_3 = vcvtq_f32_u32(temp32x4t_3);
+ auto temp32x4f_4 = vcvtq_f32_u32(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
},
input);
- auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value3);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value4);
-
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = wrapper::vgetlane(carry_paddition, 0);
-
- if(op == ReductionOperation::MEAN_SUM)
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res /= in_info.dimension(0);
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
}
+ else if(op == ReductionOperation::PROD)
+ {
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
- *(output.ptr()) = static_cast<uint8_t>(res);
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
+
+ //re-quantize result
+ res = sqcvt_qasymm8_f32(res, in_info.quantization_info().scale, in_info.quantization_info().offset);
+ *(output.ptr()) = static_cast<uint8_t>(res);
+ }
+ else
+ {
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
+
+ auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = wrapper::vgetlane(carry_paddition, 0);
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
+
+ *(output.ptr()) = static_cast<uint8_t>(res);
+ }
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpYZW
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
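+            // Seed the accumulator: first loaded vector for ARG_IDX_MIN/MAX, 1 for PROD, 0 otherwise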
+ neon_vector vec_res_value = { 0 };
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+ }
+ else if(op == ReductionOperation::PROD)
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+ }
+ else
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ }
+ uint32x4x4_t vec_res_idx{ { 0 } };
+
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
T *in_ptr;
@@ -248,159 +638,215 @@
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
+ switch(op)
{
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
- {
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
}
if(op == ReductionOperation::MEAN_SUM)
{
auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv);
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+ }
},
input, output);
}
};
-template <ReductionOperation op>
struct RedOpYZW_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ uint32x4x4_t vec_res_idx{ { 0 } };
+ auto vec_res_value1 = vdupq_n_u32(0);
+ auto vec_res_value2 = vdupq_n_u32(0);
+ auto vec_res_value3 = vdupq_n_u32(0);
+ auto vec_res_value4 = vdupq_n_u32(0);
+
+ auto vec_res_value1_f = vdupq_n_f32(1);
+ auto vec_res_value2_f = vdupq_n_f32(1);
+ auto vec_res_value3_f = vdupq_n_f32(1);
+ auto vec_res_value4_f = vdupq_n_f32(1);
+
+ auto vec_res_value = wrapper::vloadq(input.ptr());
+
+ for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
uint8_t *in_ptr;
switch(axis)
{
case 1:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim));
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
break;
case 2:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim));
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
break;
case 3:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim));
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+ const auto scale32x4f_4 = vdupq_n_f32(in_info.quantization_info().scale);
+
+ const auto temp16x8t_1 = vmovl_u8(vget_low_u8(vec_elements));
+ const auto temp16x8t_2 = vmovl_u8(vget_high_u8(vec_elements));
+
+ const auto temp32x4t_1 = vmovl_u16(vget_low_u16(temp16x8t_1));
+ const auto temp32x4t_2 = vmovl_u16(vget_high_u16(temp16x8t_1));
+ const auto temp32x4t_3 = vmovl_u16(vget_low_u16(temp16x8t_2));
+ const auto temp32x4t_4 = vmovl_u16(vget_high_u16(temp16x8t_2));
+
+ auto temp32x4f_1 = vcvtq_f32_u32(temp32x4t_1);
+ auto temp32x4f_2 = vcvtq_f32_u32(temp32x4t_2);
+ auto temp32x4f_3 = vcvtq_f32_u32(temp32x4t_3);
+ auto temp32x4f_4 = vcvtq_f32_u32(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
if(op == ReductionOperation::MEAN_SUM)
{
- const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
- const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
- const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
- const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
- const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
+ const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
+ vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
+ vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
+ vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
+ vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv);
- vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
- vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
- vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
- vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
+ vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+ vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+ vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+ vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
+ }
+ else if(op == ReductionOperation::PROD)
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(in_info.quantization_info().offset);
+ const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(in_info.quantization_info().scale));
+
+ //re-quantize
+ vec_res_value1_f = vaddq_f32(vmulq_f32(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value2_f = vaddq_f32(vmulq_f32(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value3_f = vaddq_f32(vmulq_f32(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value4_f = vaddq_f32(vmulq_f32(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+ vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+ vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+ vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+ vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
}
- const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
- const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
- auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- wrapper::vstore(output.ptr(), res);
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vec_res_idx.val[3]);
+ }
+ else
+ {
+ const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+ wrapper::vstore(output.ptr(), res);
+ }
+
},
input, output);
}
};
-void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-
-void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
switch(axis)
{
@@ -408,13 +854,13 @@
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
+ return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -422,13 +868,13 @@
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -436,13 +882,13 @@
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -450,13 +896,13 @@
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -464,84 +910,13 @@
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
}
-void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-
-TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
-{
- TensorShape output_shape{ input_shape };
- output_shape.set(axis, 1);
-
- return output_shape;
-}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
@@ -549,10 +924,19 @@
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ if(!is_arg_min_max)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
@@ -560,13 +944,15 @@
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
// Calculate output shape and set if empty
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+ DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+ auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
@@ -609,7 +995,7 @@
_reduction_axis = axis;
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -619,7 +1005,7 @@
Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
return Status{};
}
@@ -630,19 +1016,6 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_op)
- {
- case ReductionOperation::SUM_SQUARE:
- reduce_sumsq(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::MEAN_SUM:
- reduce_mean_sum(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::SUM:
- reduce_sum(window, _input, _output, _reduction_axis);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction operation.");
- }
+ reduce_op(window, _input, _output, _reduction_axis, _op);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index c718991..649fba3 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,11 +43,13 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
return Status{};
}
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
new file mode 100644
index 0000000..62e4882
--- /dev/null
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/QAsymm8.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <array>
+#include <cmath>
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEReverseKernel::NEReverseKernel()
+ : _input(nullptr), _output(nullptr), _axis(nullptr)
+{
+}
+
+void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis->info()));
+
+ // Configure kernel window
+ INEKernel::configure(calculate_max_window(*output->info()));
+}
+
+Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis));
+
+ return Status{};
+}
+
+template <typename T>
+void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output)
+{
+ int axis_bit = 0;
+ for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+ {
+ const int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
+ axis_bit |= 1 << axis_i;
+ }
+
+ // Check if we need a left-over loop for the x dimension
+ const int window_step_x = 16 / input->info()->element_size();
+ const int window_start_x = window.x().start();
+ const int window_end_x = std::min(window.x().end(), static_cast<int>(input->info()->dimension(0)));
+ const int window_end_x_multiple_of = ((window_end_x - window_start_x) / window_step_x) * window_step_x;
+ bool left_over_loop_x = (((window_end_x - window_start_x) % window_step_x) != 0);
+
+ Window slice = window.first_slice_window_4D();
+
+ if(left_over_loop_x)
+ {
+ // Check if window_end_x_multiple_of is greater than window_start_x
+ if(window_end_x_multiple_of > window_start_x)
+ {
+ slice.set(Window::DimX, Window::Dimension(window_start_x, window_end_x_multiple_of, window_step_x));
+ }
+ else
+ {
+ slice.set(Window::DimX, Window::Dimension(0, 0, 1));
+ }
+ }
+
+ do
+ {
+ Iterator input_it(input, slice);
+ execute_window_loop(slice, [&](const Coordinates & id)
+ {
+ auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()));
+
+ // Reverse 0 axis
+ if(axis_bit & 0x1)
+ {
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ }
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - id.x() - window_step_x : id.x();
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+ wrapper::vstore(out_ptr, in);
+ },
+ input_it);
+
+ if(left_over_loop_x)
+ {
+ slice.set(Window::DimX, Window::Dimension(window_end_x_multiple_of, window_end_x, 1));
+
+ Iterator input_it(input, slice);
+
+ // Compute left-over elements along the x dimension (1x1)
+ execute_window_loop(slice, [&](const Coordinates & id)
+ {
+ const auto in = *reinterpret_cast<T *>(input_it.ptr());
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - id.x() - 1 : id.x();
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
+ },
+ input_it);
+ }
+
+ }
+ while(window.slide_window_slice_4D(slice));
+}
+
+void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ run_reverse<float>(window, _input, _axis, _output);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ run_reverse<float16_t>(window, _input, _axis, _output);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::U32:
+ run_reverse<uint32_t>(window, _input, _axis, _output);
+ break;
+ case DataType::S32:
+ run_reverse<int32_t>(window, _input, _axis, _output);
+ break;
+ case DataType::S16:
+ run_reverse<int16_t>(window, _input, _axis, _output);
+ break;
+ case DataType::U16:
+ run_reverse<uint16_t>(window, _input, _axis, _output);
+ break;
+ case DataType::QASYMM8:
+ case DataType::U8:
+ run_reverse<uint8_t>(window, _input, _axis, _output);
+ break;
+ case DataType::S8:
+ run_reverse<int8_t>(window, _input, _axis, _output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
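The reverse kernel above turns the requested axes into a bitmask and mirrors every coordinate whose bit is set. A minimal scalar sketch of that index mapping, outside the NEON/window machinery (function and parameter names are illustrative, not part of the patch):

#include <array>
#include <cstddef>

// Mirror a 4D coordinate according to an axis bitmask, as the kernel does per element.
std::array<size_t, 4> reverse_coord(const std::array<size_t, 4> &id, const std::array<size_t, 4> &dims, int axis_bit)
{
    std::array<size_t, 4> out{};
    for(size_t d = 0; d < 4; ++d)
    {
        out[d] = (axis_bit & (1 << d)) ? dims[d] - id[d] - 1 : id[d];
    }
    return out;
}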
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7111644..3d300ef 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -46,11 +47,12 @@
const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
BorderMode border_mode, SamplingPolicy sampling_policy)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output == input);
- ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
ARM_COMPUTE_UNUSED(border_mode);
const DataLayout data_layout = input->data_layout();
@@ -72,6 +74,7 @@
if(policy == InterpolationPolicy::AREA)
{
ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
}
return Status{};
@@ -182,7 +185,7 @@
template <typename T>
inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
- float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+ float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
{
Iterator in(input, win_in);
Iterator out(output, window);
@@ -202,12 +205,16 @@
int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
+ const bool is_quantized = (input->info()->data_type() == DataType::QASYMM8);
+ const QuantizationInfo iq_info = input->info()->quantization_info();
+ const QuantizationInfo oq_info = output->info()->quantization_info();
+
execute_window_loop(window, [&](const Coordinates & id)
{
const auto offset = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
const auto dx_scale = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
const auto dy_scale = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_yi = std::floor((id.z() + 0.5f) * hr - 0.5f);
+ const int in_yi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
const int offset_row = in_yi * stride_h + id.x() * stride_c;
const T *in_ptr = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
@@ -251,8 +258,22 @@
const float w3 = dx1 * dy_scale;
const float w4 = dx_scale * dy_scale;
+ T res = 0;
+ // Dequantize the quantized inputs, interpolate in float, then requantize the result
+ if(is_quantized)
+ {
+ float inp00 = iq_info.dequantize(a00);
+ float inp01 = iq_info.dequantize(a01);
+ float inp10 = iq_info.dequantize(a10);
+ float inp11 = iq_info.dequantize(a11);
+ res = static_cast<T>(oq_info.quantize((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), RoundingPolicy::TO_NEAREST_UP));
+ }
+ else
+ {
+ res = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+ }
// Store result
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+ *reinterpret_cast<T *>(out.ptr()) = res;
}
else
{
@@ -273,7 +294,7 @@
} // namespace
NEScaleKernel::NEScaleKernel()
- : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode()
+ : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _sampling_offset(0)
{
}
@@ -309,6 +330,11 @@
_border_size = BorderSize(1);
_border_mode = border_mode;
+ if(sampling_policy == SamplingPolicy::CENTER)
+ {
+ _sampling_offset = 0.5f;
+ }
+
// Compute the ratio between source width/height and destination width/height
const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
@@ -387,6 +413,7 @@
switch(_input->info()->data_type())
{
+ case DataType::QASYMM8:
case DataType::U8:
{
uint8x16_t tmp = vdupq_n_u8(0);
@@ -463,6 +490,48 @@
in, offsets, out);
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ float16x8x2_t tmp =
+ {
+ {
+ vdupq_n_f16(0),
+ vdupq_n_f16(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
+
+ vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
float32x4x4_t tmp =
@@ -515,7 +584,7 @@
void NEScaleKernel::scale_bilinear_nchw(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -545,8 +614,13 @@
const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
const size_t in_stride = in_stide_in_bytes / _input->info()->element_size();
+ const bool is_quantized = (_input->info()->data_type() == DataType::QASYMM8);
+ const QuantizationInfo iq_info = _input->info()->quantization_info();
+ const QuantizationInfo oq_info = _output->info()->quantization_info();
+
switch(_input->info()->data_type())
{
+ case DataType::QASYMM8:
case DataType::U8:
{
execute_window_loop(window, [&](const Coordinates & id)
@@ -556,29 +630,55 @@
const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
- const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
const int offset_row = in_yi * in_stide_in_bytes;
uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
-
+ if(is_quantized)
+ {
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7);
+ }
+ else
+ {
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+ }
uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
-
+ if(is_quantized)
+ {
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7);
+ }
+ else
+ {
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+ }
vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
},
in, offsets, dx, dy, out);
@@ -592,7 +692,7 @@
const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
const int offset_row = in_yi * in_stide_in_bytes;
int16x8x2_t tmp =
@@ -626,6 +726,50 @@
in, offsets, dx, dy, out);
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float16x8x2_t tmp =
+ {
+ {
+ vdupq_n_f16(0),
+ vdupq_n_f16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
+
+ vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
execute_window_loop(window, [&](const Coordinates & id)
@@ -634,7 +778,7 @@
const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
const int offset_row = in_yi * in_stide_in_bytes;
float32x4x4_t tmp =
@@ -751,6 +895,7 @@
switch(_input->info()->data_type())
{
+ case DataType::QASYMM8:
case DataType::U8:
{
if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
@@ -759,7 +904,7 @@
}
else
{
- scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr,
+ scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
}
break;
@@ -772,11 +917,27 @@
}
else
{
- scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr,
+ scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
}
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c);
+ }
+ else
+ {
+ scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ }
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
@@ -785,7 +946,7 @@
}
else
{
- scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr,
+ scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
}
break;
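For the QASYMM8 bilinear path added above, each of the four neighbours is dequantized, the weighted sum is computed in float, and the result is requantized. A self-contained scalar reference of that computation; the scale/offset handling below follows the usual asymmetric-quantization formulas and the names are illustrative, not taken from the patch:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference: bilinear interpolation of four QASYMM8 samples.
uint8_t bilinear_qasymm8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                         float w1, float w2, float w3, float w4,
                         float in_scale, int in_offset, float out_scale, int out_offset)
{
    // Dequantize: (value - offset) * scale
    auto deq = [](uint8_t v, float s, int o) { return (static_cast<int>(v) - o) * s; };
    const float acc = deq(a00, in_scale, in_offset) * w1 + deq(a01, in_scale, in_offset) * w2
                    + deq(a10, in_scale, in_offset) * w3 + deq(a11, in_scale, in_offset) * w4;
    // Requantize and clamp to the 8-bit range
    const int q = static_cast<int>(std::lround(acc / out_scale)) + out_offset;
    return static_cast<uint8_t>(std::max(0, std::min(255, q)));
}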
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
new file mode 100644
index 0000000..f2697bc
--- /dev/null
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "utils/TypePrinter.h"
+
+#include <arm_neon.h>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename ScalarType, typename VectorType>
+void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+{
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator condition(cond, win);
+ Iterator input1(in1, win);
+ Iterator input2(in2, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+ for(; x <= limit; x += window_step_x)
+ {
+ const auto c = (*condition_conversion)(condition_ptr + x);
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+ }
+ for(; x < window_end_x; ++x)
+ {
+ const auto c = *(condition_ptr + x);
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+ }
+ },
+ condition, input1, input2, output);
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ const auto window_step_x = 16 / sizeof(ScalarType);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
+ {
+ static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ });
+}
+
+template <typename ScalarType>
+void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer());
+
+ const int outer_size = cond->info()->total_size() / cond->info()->element_size();
+ const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
+ int offset = 0;
+ const int step = 16 / in1->info()->element_size();
+
+ for(int i = 0; i < outer_size; ++i)
+ {
+ int x = offset;
+ const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
+ for(; x <= offset + inner_size - step; x += step)
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
+ }
+ if(x <= offset + inner_size - (step / 2))
+ {
+ wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
+ x += step / 2;
+ }
+ for(; x < offset + inner_size; ++x)
+ {
+ *(output_ptr + x) = *(input_ptr + x);
+ }
+ offset += inner_size;
+ }
+}
+} // namespace
+
+NESelectKernel::NESelectKernel()
+ : _function(nullptr), _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+{
+}
+
+void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
+
+ // Auto initialize output if not initialized
+ auto_init_if_empty(*output->info(), x->info()->tensor_shape(), 1, x->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(validate(c->info(), x->info(), y->info(), output->info()));
+
+ _c = c;
+ _x = x;
+ _y = y;
+ _output = output;
+ _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
+
+ std::string function_to_call("op_");
+ function_to_call += string_from_data_type(x->info()->data_type());
+
+ static std::map<std::string, SelectFunction *> map_function;
+
+ if(_has_same_rank)
+ {
+ map_function =
+ {
+ { "op_S8", &select_op_8<int8_t, uint8x16_t> },
+ { "op_S16", &select_op_16<int16_t, uint16x8_t> },
+ { "op_S32", &select_op_32<int32_t, uint32x4_t> },
+ { "op_U8", &select_op_8<uint8_t, uint8x16_t> },
+ { "op_U16", &select_op_16<uint16_t, uint16x8_t> },
+ { "op_U32", &select_op_32<uint32_t, uint32x4_t> },
+ { "op_F32", &select_op_32<float, uint32x4_t> }
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
+ else
+ {
+ map_function =
+ {
+ { "op_S8", &select_op_not_same_rank<int8_t> },
+ { "op_S16", &select_op_not_same_rank<int16_t> },
+ { "op_S32", &select_op_not_same_rank<int32_t> },
+ { "op_U8", &select_op_not_same_rank<uint8_t> },
+ { "op_U16", &select_op_not_same_rank<uint16_t> },
+ { "op_U32", &select_op_not_same_rank<uint32_t> },
+ { "op_F32", &select_op_not_same_rank<float> }
+ };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ _function = it->second;
+ }
+
+ Window win = calculate_max_window(x->info()->valid_region());
+ INEKernel::configure(win);
+}
+
+Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(x,
+ 1,
+ DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, y);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::U8);
+
+ const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+
+ if(output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
+ }
+
+ return Status{};
+}
+
+void NESelectKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_function == nullptr);
+ _function(_c, _x, _y, _output, window);
+}
+} // namespace arm_compute
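The select kernel chooses between two tensors element-wise when the condition has the same rank, or per outer slice when the condition is 1D. A scalar sketch of both cases over plain buffers (illustrative only, not part of the patch):

#include <cstddef>
#include <cstdint>

// Same-rank case: per-element selection.
template <typename T>
void select_same_rank(const uint8_t *c, const T *x, const T *y, T *out, size_t n)
{
    for(size_t i = 0; i < n; ++i)
    {
        out[i] = (c[i] != 0) ? x[i] : y[i];
    }
}

// 1D-condition case: one condition value selects a whole inner slice.
template <typename T>
void select_outer(const uint8_t *c, const T *x, const T *y, T *out, size_t outer, size_t inner)
{
    for(size_t i = 0; i < outer; ++i)
    {
        const T *src = (c[i] != 0) ? x : y;
        for(size_t j = 0; j < inner; ++j)
        {
            out[i * inner + j] = src[i * inner + j];
        }
    }
}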
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 0f416de..e9417ec 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -34,7 +34,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/core/utils/misc/SaturateCast.h"
#include <algorithm>
#include <arm_neon.h>
@@ -667,7 +667,7 @@
/* Run remaining elements */
for(; i < input_width; ++i)
{
- out_ptr[i] = utility::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
+ out_ptr[i] = utils::cast::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
}
}
},
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
new file mode 100644
index 0000000..0c33f36
--- /dev/null
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(idx_input >= num_tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+{
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input);
+
+ return std::make_pair(Status{}, win);
+}
+
+inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+{
+ constexpr int max_out_coord = 5; // Input shape is at most 4D; output shape is at most 5D
+ Coordinates id_out = id;
+ for(unsigned int i = max_out_coord - 1; i > axis; --i)
+ {
+ id_out.set(i, id[i - 1]);
+ }
+ id_out.set(axis, idx_input);
+ return id_out;
+}
+} // namespace
+
+NEStackLayerKernel::NEStackLayerKernel()
+ : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+{
+}
+
+void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
+
+ _input = input;
+ _output = output;
+ _axis = axis;
+ _idx_input = idx_input;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &NEStackLayerKernel::run_stack<uint8_t>;
+ break;
+ case 2:
+ _func = &NEStackLayerKernel::run_stack<uint16_t>;
+ break;
+ case 4:
+ _func = &NEStackLayerKernel::run_stack<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ return Status{};
+}
+
+void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ if(_func != nullptr)
+ {
+ (this->*_func)(window);
+ }
+}
+
+template <typename T>
+void NEStackLayerKernel::run_stack(const Window &window)
+{
+ Window window_out;
+ window_out.use_tensor_dimensions(_output->info()->tensor_shape());
+
+ Iterator input(_input, window);
+ Iterator output(_output, window_out);
+
+ const int stride_x = _output->info()->strides_in_bytes()[0];
+ const int stride_y = _output->info()->num_dimensions() >= 1 ? _output->info()->strides_in_bytes()[1] : 0;
+ const int stride_z = _output->info()->num_dimensions() >= 2 ? _output->info()->strides_in_bytes()[2] : 0;
+ const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
+ const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+ const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+ *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+ },
+ input);
+}
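Stacking maps each input coordinate to an output coordinate by shifting the dimensions at and above the stack axis up by one and writing the tensor index into the freed slot, as shift_from_axis_and_replace_coordinate does above. A scalar sketch of that coordinate mapping (names are illustrative, outside the Window/Iterator machinery):

#include <array>
#include <cstddef>

// Map a 4D input coordinate to a 5D output coordinate for stacking along 'axis'.
std::array<size_t, 5> stack_coord(const std::array<size_t, 4> &in, size_t axis, size_t idx_input)
{
    std::array<size_t, 5> out{};
    for(size_t d = 0, o = 0; d < 4; ++d, ++o)
    {
        if(o == axis)
        {
            ++o; // skip the slot taken by the stack index
        }
        out[o] = in[d];
    }
    out[axis] = idx_input;
    return out;
}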
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
new file mode 100644
index 0000000..2ae029b
--- /dev/null
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
+ {
+ return i == 0;
+ }));
+
+ // Get expected output shape
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
+ ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
+
+ // Checks output if configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ // Output tensor auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+ // Create window
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+void strided_slice_generic(const ITensor *input, ITensor *output,
+ const Coordinates &starts, const BiStrides &strides, int32_t shrink_axis_mask,
+ const Window &window)
+{
+ Iterator output_it(output, window);
+ const size_t width_size = input->info()->element_size();
+
+ const bool is_shrink_w = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
+ const bool is_shrink_h = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 1);
+ const bool is_shrink_c = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 2);
+ const bool is_shrink_n = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 3);
+
+ unsigned int index = 0;
+ const int idx_w = is_shrink_w ? 0 : index++;
+ const int idx_h = is_shrink_h ? 0 : index++;
+ const int idx_c = is_shrink_c ? 0 : index++;
+ const int idx_n = is_shrink_n ? 0 : index;
+
+ BiStrides shrinked_strides;
+ shrinked_strides.set(0, is_shrink_w ? 0 : strides[0]);
+ shrinked_strides.set(1, is_shrink_h ? 0 : strides[1]);
+ shrinked_strides.set(2, is_shrink_c ? 0 : strides[2]);
+ shrinked_strides.set(3, is_shrink_n ? 0 : strides[3]);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int w_coord = starts[0] + (id[idx_w] * shrinked_strides[0]);
+ const int h_coord = starts[1] + (id[idx_h] * shrinked_strides[1]);
+ const int c_coord = starts[2] + (id[idx_c] * shrinked_strides[2]);
+ const int n_coord = starts[3] + (id[idx_n] * shrinked_strides[3]);
+
+ Coordinates in_coords(w_coord, h_coord, c_coord, n_coord);
+ std::copy_n(input->ptr_to_element(in_coords), width_size, output_it.ptr());
+ },
+ output_it);
+}
+} // namespace
+
+NEStridedSliceKernel::NEStridedSliceKernel()
+ : _input(nullptr), _output(nullptr), _starts_abs(), _final_strides(), _shrink_mask()
+{
+}
+
+void NEStridedSliceKernel::configure(const ITensor *input, ITensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+
+ _input = input;
+ _output = output;
+ _shrink_mask = shrink_axis_mask;
+
+ const TensorShape &input_shape = input->info()->tensor_shape();
+
+ Coordinates ends_abs;
+ std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
+ input_shape,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+ starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
+
+ return Status{};
+}
+
+void NEStridedSliceKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Dispatch kernel
+ strided_slice_generic(_input, _output, _starts_abs, _final_strides, _shrink_mask, window);
+}
+} // namespace arm_compute
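Each output element of the strided-slice kernel reads from input coordinate starts[d] + id[d] * strides[d]; dimensions removed by shrink_axis_mask are pinned to their start value and dropped from the output indexing. A scalar sketch of that mapping (illustrative, not part of the patch):

#include <array>
#include <cstdint>

// Map a (possibly rank-reduced) output coordinate back to the input coordinate.
std::array<int, 4> strided_slice_coord(const std::array<int, 4> &out_id,
                                       const std::array<int, 4> &starts,
                                       const std::array<int, 4> &strides,
                                       int32_t shrink_axis_mask)
{
    std::array<int, 4> in{};
    int out_dim = 0;
    for(int d = 0; d < 4; ++d)
    {
        const bool shrunk = (shrink_axis_mask & (1 << d)) != 0;
        in[d] = starts[d] + (shrunk ? 0 : out_id[out_dim++] * strides[d]);
    }
    return in;
}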
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
new file mode 100644
index 0000000..dbeacfa
--- /dev/null
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
+ {
+ return e == 0;
+ }));
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NETileKernel::NETileKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NETileKernel::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output
+ TensorShape tiled_shape = misc::shape_calculator::compute_tiled_shape(input->info()->tensor_shape(), multiples);
+ auto_init_if_empty(*output->info(), tiled_shape, 1, input->info()->data_type());
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), multiples));
+
+ _input = input;
+ _output = output;
+
+ // Configure window without padding
+ Window win = calculate_max_window(*output->info());
+ INEKernel::configure(win);
+}
+
+Status NETileKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, multiples));
+ return Status{};
+}
+
+void NETileKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window output_window{ window };
+ output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+ Window out_slice = output_window.first_slice_window_1D();
+
+ const auto src_shape = _input->info()->tensor_shape();
+ do
+ {
+ Iterator output_it(_output, out_slice);
+
+ execute_window_loop(out_slice, [&](const Coordinates & id)
+ {
+ const size_t x = id.x();
+ const size_t y = id.y();
+ const size_t z = id.z();
+ const size_t w = id[3];
+ Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
+ },
+ output_it);
+ }
+ while(output_window.slide_window_slice_1D(out_slice));
+}
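+
+// Illustrative note (figures assumed for explanation, not part of the kernel): with
+// src_shape = [2, 3] and multiples = [2, 2] the tiled output shape is [4, 6]; output
+// coordinate (x = 3, y = 4) reads from input coordinate (3 % 2, 4 % 3) = (1, 1), and one
+// full input row along dimension 0 is copied per window iteration.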
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 870d2c9..a0a8b82 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,6 +86,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 5dca58e..aae85c6 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -95,6 +96,7 @@
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
@@ -106,6 +108,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
const int num_elems_processed_per_iteration_x = 16 / input->element_size();
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 259f4fc..4a0cf27 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -120,6 +120,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index a84a6d9..aea6875 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -56,6 +57,7 @@
Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
DataType::U8, DataType::S8, DataType::QASYMM8,
DataType::U16, DataType::S16, DataType::F16,
@@ -110,15 +112,28 @@
uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _width_offset * _output->info()->strides_in_bytes()[0];
// Create iterators
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const DataType dt = _input->info()->data_type();
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
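+ // When the QASYMM8 input and output quantization parameters differ, each vector of 16
+ // values is dequantized with the input parameters and requantized with the output ones;
+ // otherwise a plain vector copy suffices. Illustrative figures (assumed, not from the
+ // library): q = 200 with (scale = 0.5, offset = 0) dequantizes to 100.f, which
+ // requantizes with (scale = 1.0, offset = 10) to 110.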
+ if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
{
- const auto in_ptr = input.ptr();
- const auto out_ptr = output_ptr + output.offset();
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ vst1q_u8(output_ptr + output.offset(), vquantize(vdequantize(vld1q_u8(input.ptr()), input_qinfo), output_qinfo));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto in_ptr = input.ptr();
+ const auto out_ptr = output_ptr + output.offset();
- wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
- },
- input, output);
+ wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+ },
+ input, output);
+ }
}
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index 009562b..09a4a11 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,6 +44,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
{
ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 9194bdd..b561659 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,75 +38,60 @@
namespace arm_gemm {
-#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
-
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args));
- }
-
- GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-
-#elif defined(__aarch64__)
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
+static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
+#if defined(__ARM_FEATURE_SVE)
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_fp16_mla_3VLx8",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
+},
+#endif
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "hgemm_24x8",
+ [](const GemmArgs<__fp16> &args) {
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- bool is_supported(const GemmArgs<__fp16> &args) override {
return args._ci->has_fp16();
- }
-#endif
-
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args));
- }
-
- GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-#endif
-
-#endif // __aarch64__
-
-class GemmImpl_gemm_fp16_interleaved : public GemmImplementation<__fp16, __fp16> {
-public:
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
-#ifdef __aarch64__
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args));
-#elif defined(__arm__)
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args));
#else
-# error Unknown Architecture
+ return true;
#endif
- }
-
- GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
-static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{};
+ },
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
+},
#endif
-static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{};
-
-static std::vector<GemmImplementation<__fp16, __fp16> *> gemm_fp16_methods = {
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
- &gemm_fp16_interleaved_fp16_impl,
+#if defined(__arm__)
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_8x6",
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
+},
#endif
- &gemm_fp16_interleaved_impl
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr,
+}
};
template<>
-std::vector<GemmImplementation<__fp16, __fp16> *> &gemm_implementation_list<__fp16, __fp16>() {
+const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp16>() {
return gemm_fp16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(GemmArgs<__fp16> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<__fp16, __fp16>(GemmArgs<__fp16> &args);
-template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, GemmArgs<__fp16> &args);
+template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
+template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS
+#endif // __ARM_FP16_ARGS
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 7d14971..8bc33cc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemm_native.hpp"
@@ -30,112 +31,140 @@
#include "gemv_native_transposed.hpp"
#include "gemv_pretransposed.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_sgemv_trans.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_smallK_fp32_mla_1VLx4.hpp"
+#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp"
namespace arm_gemm {
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-// SGEMM implementations for AArch64 without SVE
+static const GemmImplementation<float, float> gemm_fp32_methods[] =
+{
+{
+ GemmMethod::GEMV_BATCHED,
+ "gemv_batched",
+ [](const GemmArgs<float> &args) { return (args._Msize==1) && (args._nbatches>1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvBatched<float, float>(args); }
+},
+#ifdef __aarch64__
+{
+ GemmMethod::GEMV_PRETRANSPOSED,
+ "sgemv_pretransposed",
+ [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
+},
+{
+ GemmMethod::GEMV_NATIVE_TRANSPOSED,
+ "sgemv_trans",
+ [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvNativeTransposed<sgemv_trans, float, float>(args); }
+},
-// Pretransposed GEMV
-class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1);
- }
+#ifdef __ARM_FEATURE_SVE
+ // SVE smallK / native / hybrid methods
+{
+ GemmMethod::GEMM_HYBRID,
+ "smallK_hybrid_fp32_mla_1VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && args._alpha==1.0f && args._pretransposed_hint; },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp32_mla_4VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "smallK_fp32_mla_1VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && !args._trB && args._alpha==1.0f; },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmNative<smallK_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_fp32_mla_4VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmNative<native_fp32_mla_4VLx4, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemvPretransposed<sgemv_pretransposed, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._trB, args._beta));
- }
+// NEON native / hybrid methods
+{
+ GemmMethod::GEMM_HYBRID,
+ "sgemm_nativeA_pretransposeB_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "sgemm_native_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmNative<sgemm_native_16x4, float, float>(args); }
+},
- GemmImpl_sgemm_gemv_pretransposed() : GemmImplementation<float, float>(GemmMethod::GEMV_PRETRANSPOSED) { }
-};
-
-// Native GEMV
-class GemmImpl_sgemm_gemv_native_transposed : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1);
- }
-
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemvNativeTransposed<sgemv_trans, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._beta));
- }
-
- GemmImpl_sgemm_gemv_native_transposed() : GemmImplementation<float, float>(GemmMethod::GEMV_NATIVE_TRANSPOSED) { }
-};
-
-// Native GEMM
-class GemmImpl_sgemm_gemm_native : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB);
- }
-
- bool is_recommended(const GemmArgs<float> &args) override {
- return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8));
- }
-
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemmNative<sgemm_native_16x4, float, float>(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, args._beta));
- }
-
- GemmImpl_sgemm_gemm_native() : GemmImplementation<float, float>(GemmMethod::GEMM_NATIVE) { }
-};
+#ifdef __ARM_FEATURE_SVE
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_fp32_mla_3VLx8",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+},
#endif // __aarch64__
-// Interleaved GEMM
-class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation<float, float> {
-public:
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-#ifdef __ARM_FEATURE_SVE
- return UniqueGemmCommon<float, float> (new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args));
-#elif defined(__aarch64__)
- return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(args));
-#elif defined(__arm__)
- return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(args));
-#else
-# error Unknown Architecture.
-#endif
- }
-
- GemmImpl_sgemm_gemm_interleaved() : GemmImplementation<float, float>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemv_batched<float, float> gemv_batched_impl{};
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{};
-static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{};
-static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{};
-#endif
-static GemmImpl_sgemm_gemm_interleaved sgemm_gemm_interleaved_impl{};
-
-/* List of implementations (order matters) */
-static std::vector<GemmImplementation<float, float> *> SGemmMethods = {
- &gemv_batched_impl,
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
- &sgemm_gemv_pretransposed_impl,
- &sgemm_gemv_native_transposed_impl,
- &sgemm_gemm_native_impl,
-#endif
- &sgemm_gemm_interleaved_impl
+#ifdef __arm__
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_8x6",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_8x6, float, float>(args); }
+},
+#endif // __arm__
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
/* Templated function to return this list. */
template<>
-std::vector<GemmImplementation<float, float> *> &gemm_implementation_list<float, float>() {
- return SGemmMethods;
+const GemmImplementation<float, float> *gemm_implementation_list<float, float>() {
+ return gemm_fp32_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<float, float> gemm<float, float>(GemmArgs<float> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<float, float>(GemmArgs<float> &args);
-template bool method_is_compatible<float, float>(GemmMethod method, GemmArgs<float> &args);
+template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
+template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
+template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
+template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
new file mode 100644
index 0000000..c2bd0bb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "ndrange.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm {
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr>
+class GemmHybrid : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo * const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ const bool _trB;
+
+ const Tr _beta;
+
+ /* Blocking info */
+ const unsigned int _k_block;
+ const unsigned int _n_block;
+ const unsigned int _Mround;
+
+ /* Pretransposed buffer. */
+ const Toi *_B_transposed=nullptr;
+
+ const NDRange<4> _window_range;
+
+ static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, numk_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
+
+ return k_block;
+ }
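+ // Worked example for compute_k_block (figures assumed for explanation): with a 32KB L1,
+ // FP32 operands, out_width() = 16, out_height() = 4, k_unroll() = 1 and Ksize = 1000, the
+ // initial k_block is 16384 / (4 * 16) = 256; 1000 is then split into iceildiv(1000, 256) = 4
+ // blocks of iceildiv(1000, 4) = 250, already a multiple of the unroll, so k_block = 250.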
+
+ static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+ if (args._cfg && args._cfg->outer_block_size) {
+ return args._cfg->outer_block_size;
+ }
+
+ const unsigned int k_block = compute_k_block(args);
+ const unsigned int L2_size = args._ci->get_L2_cache_size();
+
+ // n_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1U) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ unsigned int numblocks = iceildiv(args._Nsize, n_block);
+ n_block = iceildiv(args._Nsize, numblocks);
+ n_block = roundup(n_block, strategy::out_width());
+
+ return n_block;
+ }
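+ // Worked example for compute_n_block, continuing the assumed figures with a 512KB L2 and
+ // Nsize = 1000: ((512K * 9 / 10) - 250 * 4 * (16 + 4)) / (4 * 250) = 451, rounded down to
+ // 448, a multiple of out_width(); splitting 1000 columns into iceildiv(1000, 448) = 3
+ // blocks gives iceildiv(1000, 3) = 334, rounded up to 336.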
+
+public:
+ GemmHybrid(GemmHybrid &) = delete;
+ GemmHybrid & operator= (GemmHybrid &) = delete;
+
+ /* Constructor */
+ GemmHybrid(const GemmArgs<Tr> &args)
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _Mround(roundup(args._Msize, strategy::out_height())),
+ _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }
+
+ // Interface implementation - Compulsory functions
+ unsigned int get_window_size() const override {
+ return _window_range.total_size();
+ }
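+ // The window returned by get_window_size() is four-dimensional: M blocks x batches x
+ // N blocks x multis. As an illustration (sizes assumed): Msize = 64 with out_height() = 4
+ // gives 16 row blocks, nbatches = 2, Nsize = 672 with n_block = 336 gives 2 column blocks,
+ // and nmulti = 1, for a total of 16 * 2 * 2 * 1 = 64 schedulable work items.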
+
+ // This kernel can always be dynamically scheduled.
+ bool supports_dynamic_scheduling() const override {
+ return true;
+ }
+
+ // Execute
+ void execute(unsigned int start, unsigned int end, int threadid) override {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_ci);
+
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+ static_assert(std::is_same<To, Toi>::value, "gemm_hybrid: Operand types must be the same.");
+ static_assert(std::is_same<Tr, Tri>::value, "gemm_hybrid: Result types must be the same.");
+
+ /* The scheduling window does not cover the K dimension: each work item
+ * handles the full K extent for its output block, so access to the output
+ * array needs no synchronization. The loop over K blocks is therefore
+ * kept serial here, outside the window iteration. */
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+ auto p = _window_range.iterator(start, end);
+
+ if (p.done()) {
+ return;
+ }
+
+ do {
+ const unsigned int m_start = p.dim(0) * strategy::out_height();
+ const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+ const unsigned int batch = p.dim(1);
+ const unsigned int n0 = p.dim(2) * _n_block;
+ const unsigned int nmax = std::min(n0 + _n_block, _Nsize);
+ const unsigned int multi = p.dim(3);
+
+ const Toi *b_panel = _B_transposed +
+ (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
+ (k0 * roundup(_Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+#endif
+
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
+ b_panel,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
+ (k0 == 0) ? _beta : static_cast<Tr>(1),
+ (m_end - m_start), (nmax - n0), kern_k);
+ } while (p.next_dim1());
+ }
+ }
+
+ // Interface implementation - pretransposed
+ bool B_is_pretransposed() const override {
+ return true;
+ }
+
+ bool B_pretranspose_required() const override {
+ return (_B_transposed==nullptr);
+ }
+
+ size_t get_B_pretransposed_array_size() const override {
+ return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
+ }
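+ // Illustrative sizing (figures assumed): Nsize = 100 rounds up to 112 with out_width() = 16,
+ // Ksize = 50 rounds up to 52 with k_unroll() = 4, so one multi of FP32 data needs
+ // 112 * 52 * 4 = 23296 bytes of pretransposed storage.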
+
+ using GemmCommon<To, Tr>::pretranspose_B_array;
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+ _B_transposed = buffer;
+ strategy strat(_ci);
+
+ for (unsigned int multi=0; multi<_nmulti; multi++) {
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
+
+ for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
+ const unsigned int xmax = std::min(x0+_n_block, _Nsize);
+
+ const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
+
+ strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax, k0, kmax, _trB);
+
+ buffer += size;
+ }
+ }
+ }
+ }
+
+ void set_pretransposed_B_data(void *in_buffer) override {
+ _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 6734e3c..bf80784 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,56 +22,53 @@
* SOFTWARE.
*/
-#include "gemv_batched.hpp"
+#include <arm_gemm.hpp>
+
+#include <functional>
namespace arm_gemm {
template<typename Top, typename Tret>
-class GemmImplementation {
-public:
- /* Is this implementation compatible with the args as provided? */
- virtual bool is_supported(const GemmArgs<Tret> &args) { return true; }
- /* Is this implementation "recommended" for these args (heuristic)? */
- virtual bool is_recommended(const GemmArgs<Tret> &args) { return true; }
- /* Instantiate this method please. */
- virtual UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) = 0;
-
- /* Indicate the "GemmMethod" for use as a selector */
- const GemmMethod method;
-
- virtual ~GemmImplementation() { }
-
- GemmImplementation(GemmMethod method) : method(method) { }
-};
-
-/* "gemv_batched" implementation is type-agnostic, so template it here. */
-template<typename Top, typename Tret>
-class GemmImpl_gemv_batched : public GemmImplementation<Top, Tret> {
-public:
- bool is_supported(const GemmArgs<Tret> &args) override {
- return (args._Msize==1 && args._nbatches > 1);
- }
-
- UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) override {
- return UniqueGemmCommon<Top, Tret> (new GemvBatched<Top, Tret>(args));
- }
-
- GemmImpl_gemv_batched() : GemmImplementation<Top, Tret>(GemmMethod::GEMV_BATCHED) { }
+struct GemmImplementation {
+ const GemmMethod method;
+ const char * name;
+ std::function<bool(const GemmArgs<Tret> &)> is_supported;
+ std::function<bool(const GemmArgs<Tret> &)> is_recommended;
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs<Tret> &)> instantiate;
};
/* "Master" function implemented for each valid combination of types.
* Returns a list of GEMM implementation descriptors for processing by the
- * other functions. */
+ * other functions, terminated by an implementation with
+ * method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret>
-std::vector<GemmImplementation<Top, Tret> *> &gemm_implementation_list();
+const GemmImplementation<Top, Tret> *gemm_implementation_list();
+/*
+ * Select a GEMM implementation for the given arguments.
+ *
+ * The logic here returns the first method on the list which supports the
+ * requested problem parameters, matches the provided filters (method and/or
+ * name string match) and recommends itself.
+ *
+ * If there is no such method, it will return the first method which
+ * supports the requested parameters and passes the filters, regardless of
+ * recommendation.
+ *
+ * If no method supports the requested parameters and passes the filters,
+ * this function returns false and doesn't touch the provided pointer
+ * reference.
+ */
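+/*
+ * For example (hypothetical FP32 problem on an AArch64 build): with Msize == 1,
+ * nbatches == 1, alpha == 1.0f and pretransposed_hint set, the list in gemm_fp32.cpp
+ * yields the "sgemv_pretransposed" entry, the first one that supports the problem and
+ * does not decline recommendation; supplying a GemmConfig whose filter is "native"
+ * instead restricts the search to entries whose kernel name contains that substring.
+ */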
template<typename Top, typename Tret>
-GemmImplementation<Top, Tret> *find_implementation(GemmArgs<Tret> &args, GemmConfig *cfg) {
+bool find_implementation(const GemmArgs<Tret> &args, const GemmImplementation<Top, Tret> * &impl) {
auto gemms = gemm_implementation_list<Top, Tret>();
+ const GemmConfig *cfg = args._cfg;
- for(auto &&i : gemms) {
+ const GemmImplementation<Top, Tret> *saved_impl = nullptr;
+
+ for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Skip if this implementation doesn't support these args. */
- if (!i->is_supported(args)) {
+ if (i->is_supported != nullptr && !i->is_supported(args)) {
continue;
}
@@ -80,52 +77,92 @@
continue;
}
- /* If no specific method is requested, check that this method recommends itself. */
- if ((!cfg || cfg->method == GemmMethod::DEFAULT) && !i->is_recommended(args)) {
+ /* Skip if a filter is to be applied and it doesn't match. */
+ if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
continue;
}
- return i;
+ /* At this point, if we don't have a saved implementation, save this
+ * one. This is so that we always return something if a filter
+ * matches, even if it doesn't recommend itself.
+ */
+ if (saved_impl == nullptr) {
+ saved_impl=i;
+ }
+
+ /* Check that this method recommends itself. */
+ if (i->is_recommended != nullptr && !i->is_recommended(args)) {
+ continue;
+ }
+
+ impl=i;
+
+ return true;
}
- return nullptr;
-}
-
-template<typename Top, typename Tret>
-UniqueGemmCommon<Top, Tret> gemm(GemmArgs<Tret> &args, GemmConfig *cfg) {
- auto impl = find_implementation<Top, Tret>(args, cfg);
-
- if (impl) {
- return impl->instantiate(args);
- }
-
- return UniqueGemmCommon<Top, Tret>(nullptr);
-}
-
-template<typename Top, typename Tret>
-GemmMethod get_gemm_method(GemmArgs<Tret> &args) {
- auto impl = find_implementation<Top, Tret>(args, nullptr);
-
- if (impl) {
- return impl->method;
- }
-
- /* This shouldn't happen - there should always be at least one valid implementation. */
- return GemmMethod::DEFAULT;
-}
-
-template<typename Top, typename Tret>
-bool method_is_compatible(GemmMethod method, GemmArgs<Tret> &args) {
- /* Determine if the method is valid by attempting to obtain an implementation specifying this method. */
- GemmConfig cfg(method);
-
- auto impl = find_implementation<Top, Tret>(args, &cfg);
-
- if (impl) {
+ /* We didn't find an option matching the filters that recommended
+ * itself. But if we found something earlier that matched the filters
+ * but wasn't recommended, return it here. */
+ if (saved_impl != nullptr) {
+ impl = saved_impl;
return true;
}
return false;
}
-} // namespace arm_gemm
+template<typename Top, typename Tret>
+std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
+ std::vector<std::string> res;
+
+ auto gemms = gemm_implementation_list<Top, Tret>();
+
+ for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
+ /* Check that this implementation supports the presented problem. */
+ if (i->is_supported != nullptr && !i->is_supported(args)) {
+ continue;
+ }
+
+ res.push_back(i->name);
+ }
+
+ return res;
+}
+
+template<typename Top, typename Tret>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs<Tret> &args) {
+ const GemmImplementation<Top, Tret> *impl;
+
+ if (find_implementation<Top, Tret>(args, impl)) {
+ return UniqueGemmCommon<Top, Tret>(impl->instantiate(args));
+ }
+
+ return UniqueGemmCommon<Top, Tret>(nullptr);
+}
+
+template<typename Top, typename Tret>
+KernelDescription get_gemm_method(const GemmArgs<Tret> &args) {
+ const GemmImplementation<Top, Tret> *impl;
+
+ if (find_implementation<Top, Tret>(args, impl)) {
+ return KernelDescription(impl->method, impl->name);
+ }
+
+ /* This shouldn't happen - there should always be at least one valid implementation. */
+ return KernelDescription();
+}
+
+template<typename Top, typename Tret>
+bool method_is_compatible(GemmMethod method, const GemmArgs<Tret> &args) {
+ /* Determine if the method is valid by attempting to obtain an implementation specifying this method. */
+ GemmConfig cfg(method);
+ GemmArgs<Tret> myargs = args;
+
+ myargs._cfg = &cfg;
+
+ const GemmImplementation<Top, Tret> *impl;
+
+ return find_implementation<Top, Tret>(myargs, impl);
+}
+
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index ad171a7..b4503dd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,30 +32,33 @@
namespace arm_gemm {
-class GemmImpl_gemm_s16_interleaved : public GemmImplementation<int16_t, int32_t> {
-public:
- UniqueGemmCommon<int16_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s16_interleaved() : GemmImplementation<int16_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s16_interleaved gemm_s16_interleaved_impl{};
-
-static std::vector<GemmImplementation<int16_t, int32_t> *> gemm_s16_methods = {
- &gemm_s16_interleaved_impl
+static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s16_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<int16_t, int32_t> *> &gemm_implementation_list<int16_t, int32_t>() {
+const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, int32_t>() {
return gemm_s16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int16_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 627d8ab..5811c2a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,69 +25,78 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
#include "kernels/a64_gemm_s16_12x8.hpp"
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
namespace arm_gemm {
+static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
- bool is_supported(const GemmArgs<int32_t> &args) override {
- return args._ci->has_dotprod();
- }
-
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_s8s32_dot_4VLx4",
+ [](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+ [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs<int32_t> &args) { return new GemmNative<native_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_s8s32_dot_3VLx8",
+ [](const GemmArgs<int32_t> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
+},
#endif
-
-class GemmImpl_gemm_s8_interleaved : public GemmImplementation<int8_t, int32_t> {
-public:
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s8_interleaved_dot gemm_s8_interleaved_dot_impl{};
-static GemmImpl_gemm_s8_interleaved gemm_s8_interleaved_impl{};
-
-static std::vector<GemmImplementation<int8_t, int32_t> *> gemm_s8_methods = {
- &gemm_s8_interleaved_dot_impl,
- &gemm_s8_interleaved_impl
+{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_s8s32_dot_16x4",
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s8_12x8",
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<int8_t, int32_t> *> &gemm_implementation_list<int8_t, int32_t>() {
+const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int32_t>() {
return gemm_s8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int8_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 0e58a4d..b83ccd3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -318,50 +318,57 @@
/* Constructor */
GemmInterleaved(const GemmArgs<Tr> &args)
- : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
- _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
- _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
- _pretransposed(args._pretransposed_hint) {
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
+ _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _pretransposed(args._pretransposed_hint) {
const unsigned int L1_size = _ci->get_L1_cache_size();
const unsigned int L2_size = _ci->get_L2_cache_size();
assert(_maxthreads > 0);
- // Work out blocking parameters
+ // Work out blocking parameters, or override from provided GemmConfig
+ if (args._cfg && args._cfg->inner_block_size) {
+ _k_block = args._cfg->inner_block_size;
+ } else {
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll();
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = iceildiv(_Ksize, _k_block);
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(_Ksize, num_k_blocks);
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll());
+ _k_block *= strategy::k_unroll();
+ }
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
+ if (args._cfg && args._cfg->outer_block_size) {
+ _x_block = args._cfg->outer_block_size;
+ } else {
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * _k_block);
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width();
+ _x_block = std::max(_x_block, 1U) * strategy::out_width();
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
+ // And tune to the presented problem size.
+ unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+ _x_block = iceildiv(_Nsize, num_x_blocks);
- // And tune to the presented problem size.
- int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
-
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
+ _x_block = iceildiv(_x_block, strategy::out_width());
+ _x_block *= strategy::out_width();
+ }
// Work out the rounded size of M - needed for some buffers.
_Mround = iceildiv(_Msize, strategy::out_height());
@@ -457,8 +464,8 @@
do {
/* Figure out the size of each block. */
- size_t x_size = (current.xmax() - current.x0());
- size_t k_size = (current.kmax() - current.k0());
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
/* Round sizes up as needed. */
x_size = iceildiv(x_size, strategy::out_width());
@@ -473,6 +480,7 @@
return total;
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
blockwalker current(*this);
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
@@ -481,8 +489,8 @@
do {
/* Figure out the size of each block. */
- size_t x_size = (current.xmax() - current.x0());
- size_t k_size = (current.kmax() - current.k0());
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
/* Round sizes up as needed. */
x_size = iceildiv(x_size, strategy::out_width());
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index baa1316..98516b1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,8 +27,7 @@
#include "arm_gemm.hpp"
-#include "mergeresults.hpp"
-#include "transform.hpp"
+#include "ndrange.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -55,35 +54,46 @@
const unsigned int _nbatches;
const unsigned int _nmultis;
- Tr _beta;
+ const Tr _beta;
const CPUInfo * const _ci;
- unsigned int k_block=0;
- unsigned int n_block=0;
+ const unsigned int _k_block;
+ const unsigned int _n_block;
- unsigned int window_per_batch() const {
- return iceildiv(_Msize, strategy::out_height());
+ const NDRange<4> _window_range;
+
+ static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+ return args._Ksize;
}
- unsigned int window_per_multi() const {
- return window_per_batch() * _nbatches;
+ static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+ if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) {
+ return args._cfg->outer_block_size;
+ } else {
+ return args._Nsize;
+ }
}
public:
GemmNative(GemmNative &) = delete;
GemmNative & operator= (GemmNative &) = delete;
- GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
- _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
- /* For now don't do any blocking. TODO: figure out if we should. */
- k_block = K;
- n_block = N;
- }
+ GemmNative(const GemmArgs<Tr> &args)
+ : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmultis(args._nmulti),
+ _beta(args._beta), _ci(args._ci),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
// Window is amount per multi multiplied by total number of multis.
unsigned int get_window_size() const override {
- return window_per_multi() * _nmultis;
+ return _window_range.total_size();
+ }
+
+ // Native GEMMs can always be dynamically scheduled (whether requested or not)
+ bool supports_dynamic_scheduling() const override {
+ return true;
}
// Actually execute the GEMM.
@@ -96,40 +106,30 @@
static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
- /* Compute starting point based on 'start' */
- unsigned int multi = start / window_per_multi();
- unsigned int multi_pos = start % window_per_multi();
+ auto p = _window_range.iterator(start, end);
- unsigned int batch = multi_pos / window_per_batch();
- unsigned int batch_pos = multi_pos % window_per_batch();
+ if (p.done()) {
+ return;
+ }
- unsigned int y0 = batch_pos * strategy::out_height();
+ do {
+ unsigned int y0 = p.dim(0) * strategy::out_height();
+ unsigned int ymax = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+ unsigned int batch = p.dim(1);
+ unsigned int n0 = p.dim(2) * _n_block;
+ unsigned int nmax = std::min(n0 + _n_block, _Nsize);
+ unsigned int multi = p.dim(3);
- for (unsigned int pos=start; pos<end; pos++) {
- const unsigned int ymax = std::min(y0 + strategy::out_height(), _Msize);
#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize);
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize);
#endif
strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
- this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
- this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
- _beta, (ymax-y0), _Nsize, _Ksize);
-
- /* Advance to next item */
- y0 += strategy::out_height();
-
- /* Check for batch/multi overflow */
- if (y0 >= _Msize) {
- y0=0;
- batch++;
- if (batch == _nbatches) {
- batch=0;
- multi++;
- }
- }
- }
+ this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc,
+ _beta, (ymax-y0), (nmax - n0), _Ksize);
+ } while (p.next_dim1());
}
};
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index feea482..6bcbca9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,31 +32,34 @@
namespace arm_gemm {
-class GemmImpl_gemm_u16_interleaved : public GemmImplementation<uint16_t, uint32_t> {
-public:
- UniqueGemmCommon<uint16_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u16_interleaved() : GemmImplementation<uint16_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u16_interleaved gemm_u16_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint16_t, uint32_t> *> gemm_u16_methods = {
- &gemm_u16_interleaved_impl
+static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u16_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<uint16_t, uint32_t> *> &gemm_implementation_list<uint16_t, uint32_t>() {
+const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t, uint32_t>() {
return gemm_u16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint16_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
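The gemm_uint16.cpp change above replaces the per-method wrapper classes with a flat, sentinel-terminated table of candidate implementations, each carrying optional is_supported / is_recommended predicates and an instantiation lambda. A simplified standalone sketch of that table-driven selection pattern follows; the Candidate struct and pick() function are illustrative stand-ins, not the real GemmImplementation / gemm_implementation_list API, and the selection policy shown is an assumption.

    #include <cstdio>

    struct Args { unsigned int K; bool has_dotprod; };

    struct Candidate {
        const char *name;                       // empty name marks the end-of-list sentinel
        bool (*is_supported)(const Args &);     // nullptr means "always supported"
        bool (*is_recommended)(const Args &);   // nullptr means "always recommended"
    };

    static const Candidate candidates[] = {
        { "interleaved_dot", [](const Args &a) { return a.has_dotprod && a.K > 4; }, nullptr },
        { "interleaved_generic", nullptr, nullptr },
        { "", nullptr, nullptr }                // DEFAULT-style sentinel
    };

    // First supported-and-recommended entry wins; fall back to the first supported one.
    const Candidate *pick(const Args &args) {
        const Candidate *fallback = nullptr;
        for (const Candidate *c = candidates; c->name[0] != '\0'; c++) {
            bool supported   = !c->is_supported   || c->is_supported(args);
            bool recommended = !c->is_recommended || c->is_recommended(args);
            if (supported && recommended) return c;
            if (supported && !fallback) fallback = c;
        }
        return fallback;
    }

    int main() {
        Args args{64, false};
        if (const Candidate *c = pick(args)) std::printf("selected: %s\n", c->name);
        return 0;
    }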
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b7c1bab..b95ca80 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,64 +27,75 @@
#include "gemm_common.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_hybrid.hpp"
+#include "gemm_native.hpp"
#include "kernels/a64_gemm_u16_12x8.hpp"
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
+#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
namespace arm_gemm {
+static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
- bool is_supported(const GemmArgs<uint32_t> &args) override {
- return args._ci->has_dotprod();
- }
-
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_u8u32_dot_4VLx4",
+ [](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+ [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs<uint32_t> &args) { return new GemmNative<native_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_u8u32_dot_3VLx8",
+ [](const GemmArgs<uint32_t> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
+},
#endif
-
-class GemmImpl_gemm_u8_interleaved : public GemmImplementation<uint8_t, uint32_t> {
-public:
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u8_interleaved_dot gemm_u8_interleaved_dot_impl{};
-static GemmImpl_gemm_u8_interleaved gemm_u8_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint8_t, uint32_t> *> gemm_u8_methods = {
- &gemm_u8_interleaved_dot_impl,
- &gemm_u8_interleaved_impl
+{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_u8u32_dot_16x4",
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u8_12x8",
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<uint8_t, uint32_t> *> &gemm_implementation_list<uint8_t, uint32_t>() {
+const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, uint32_t>() {
return gemm_u8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint8_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index d65971e..32d668f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,9 +41,10 @@
GemmArgs<Tr> newargs = args;
newargs._Msize = args._nbatches;
newargs._nbatches = 1;
- _subgemm = gemm<To,Tr>(newargs, nullptr);
+ _subgemm = gemm<To,Tr>(newargs);
}
+ using GemmCommon<To, Tr>::set_arrays;
void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
const To *B, const int ldb, const int B_multi_stride,
Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
@@ -85,6 +86,7 @@
return _subgemm->get_B_pretransposed_array_size();
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
_subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 241c5fe..5ebc634 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,15 +64,16 @@
GemvNativeTransposed(GemvNativeTransposed &) = delete;
GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
- GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
+ GemvNativeTransposed(const GemmArgs<Tr> &args)
+ : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) {
/* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
+ m_block = _Ksize;
+ n_block = _Nsize;
}
// Window is number of out_width blocks times number of multis.
unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
}
// Actually execute the GEMV.
@@ -82,12 +83,12 @@
#endif
strategy strat(_ci);
- const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
- const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
- const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
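get_window_size() and the start/end handling above both build on rounding-up division of N by the strategy's out_width(). A short standalone sketch of that arithmetic; the numbers are arbitrary and the decomposition shown mirrors only the single-position case, not the full start/end pair used in execute().

    #include <algorithm>
    #include <cstdio>

    // Same rounding-up division the code above uses for window sizing.
    static unsigned int iceildiv(unsigned int a, unsigned int b) {
        return (a + b - 1) / b;
    }

    int main() {
        const unsigned int N = 100, out_width = 12, nmultis = 3;
        const unsigned int window_per_multi = iceildiv(N, out_width);     // 9
        const unsigned int window_size = window_per_multi * nmultis;      // 27

        // Decompose one flat window position into (multi, n0..nmax).
        unsigned int pos = 20;
        unsigned int multi = pos / window_per_multi;                      // 2
        unsigned int n0 = (pos % window_per_multi) * out_width;           // 24
        unsigned int nmax = std::min(n0 + out_width, N);                  // 36
        std::printf("window=%u multi=%u n0=%u nmax=%u\n", window_size, multi, n0, nmax);
        return 0;
    }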
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index e53ddb2..f7beb0a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,17 +68,26 @@
GemvPretransposed(GemvPretransposed &) = delete;
GemvPretransposed & operator= (GemvPretransposed &) = delete;
- GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
- _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
- _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
+ GemvPretransposed(const GemmArgs<Tr> &args)
+ : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci),
+ _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
/* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
+ if (args._cfg && args._cfg->inner_block_size) {
+ m_block = args._cfg->inner_block_size;
+ } else {
+ m_block = _Ksize;
+ }
+
+ if (args._cfg && args._cfg->outer_block_size) {
+ n_block = args._cfg->outer_block_size;
+ } else {
+ n_block = _Nsize;
+ }
}
// Window is number of out_width blocks, times number of multis.
unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
}
// Actually execute the GEMV.
@@ -89,13 +98,13 @@
strategy strat(_ci);
/* Break the window values down into multis of interest... */
- const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
/* ... and figure out where we start and end in the first and last multi. */
- const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
- const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
@@ -115,8 +124,8 @@
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
#endif
/* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
- strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
- (_Ksize * strategy::A_interleave),
+ strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()),
+ (_Ksize * strategy::A_interleave()),
this->_Aptr + (multi * this->_A_multi_stride) + m0,
this->_Cptr + (multi * this->_C_multi_stride) + n,
_beta, (mmax-m0), (nmax-n));
@@ -139,6 +148,7 @@
return _buffer_per_multi * _nmultis * sizeof(To);
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
@@ -146,10 +156,10 @@
/* Reverse sense here as we are dealing with B rather than A. So if
* strategy::A_transpose is false and _trB is false, we still
* transpose. */
- if (_trB ^ strategy::A_transpose) {
- Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ if (_trB ^ strategy::A_transpose()) {
+ Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
} else {
- Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
}
}
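The GemvPretransposed constructor above now takes its blocking sizes from an optional config, falling back to the unblocked defaults (m_block = K, n_block = N) when no config is supplied or a field is zero. A small standalone sketch of that fallback pattern; Config and Args are illustrative stand-ins for the real GemmConfig and GemmArgs.

    #include <cstdio>

    struct Config { unsigned int inner_block_size; unsigned int outer_block_size; };
    struct Args   { unsigned int K; unsigned int N; const Config *cfg; };

    struct Blocking { unsigned int m_block; unsigned int n_block; };

    // A zero (unset) config field falls back to the "no blocking" default of the full dimension.
    Blocking choose_blocking(const Args &args) {
        Blocking b;
        b.m_block = (args.cfg && args.cfg->inner_block_size) ? args.cfg->inner_block_size : args.K;
        b.n_block = (args.cfg && args.cfg->outer_block_size) ? args.cfg->outer_block_size : args.N;
        return b;
    }

    int main() {
        Config cfg{128, 0};                    // only override the inner block
        Args args{1024, 512, &cfg};
        Blocking b = choose_blocking(args);
        std::printf("m_block=%u n_block=%u\n", b.m_block, b.n_block);  // 128, 512
        return 0;
    }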
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index 06e6245..2349722 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,15 +50,15 @@
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 8;
}
- static int out_height() {
+ static unsigned int out_height() {
return 6;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 95a2bc2..2fcb587 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,15 +48,15 @@
typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index fdc0200..cc205dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,15 +43,15 @@
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index be7ead9..71c666a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,15 +42,15 @@
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 4;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 16;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index d2692ba..3d5c92c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,15 +48,15 @@
typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index a252abf..9032ba6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,15 +53,15 @@
static const bool B_transpose = true;
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 2da3ecd..fda7657 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,15 +50,15 @@
static const bool B_transpose = true;
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 4;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 16;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 911a4eb..5b850b7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,15 +47,15 @@
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 24;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 418a375..4ad38cb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,9 +32,9 @@
// Kernel implementation.
//
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
+// 24x8), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
new file mode 100644
index 0000000..c8934df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+class hybrid_s8s32_dot_16x4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+
+ hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) {
+ kernel = a64_hybrid_s8s32_dot_16x4_a55;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
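The new strategy class above defaults to the generic kernel and swaps in the A55-tuned variant when the detected CPU model matches. A minimal standalone sketch of that dispatch pattern; CPUModel and CpuInfo here are simplified stand-ins, not the real arm_gemm types.

    #include <cstdio>

    enum class CPUModel { GENERIC, A55r1 };
    struct CpuInfo { CPUModel model; CPUModel get_cpu_model() const { return model; } };

    using kern_type = void (*)(int);

    void kernel_generic(int n) { std::printf("generic kernel, n=%d\n", n); }
    void kernel_a55(int n)     { std::printf("A55-tuned kernel, n=%d\n", n); }

    struct Strategy {
        // Default to the generic kernel; swap in the tuned variant for matching cores.
        kern_type kernel = kernel_generic;
        Strategy(const CpuInfo *ci) {
            if (ci->get_cpu_model() == CPUModel::A55r1) {
                kernel = kernel_a55;
            }
        }
    };

    int main() {
        CpuInfo ci{CPUModel::A55r1};
        Strategy strat(&ci);
        strat.kernel(16);
        return 0;
    }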
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
new file mode 100644
index 0000000..48bf842
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d3, [a_ptr3, #0x10]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x18]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
new file mode 100644
index 0000000..0179139
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+    // beta == 0 lets the kernel start its accumulators at zero instead of loading and scaling C.
+    const long beta0 = (beta == 0);
+    // K rounded up to a multiple of 4, the granule consumed by each sdot lane; used to stride through the packed B panels.
+    const int K_stride = ((K + 3) / 4) * 4;
+    // The main loop consumes 32 values of K per iteration; 'regs' selects a one- or two-block (16 or 32 value) epilogue.
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+
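+    // Process the output four rows at a time; when fewer than four rows remain, the switch below picks a narrower variant.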
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
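+        // Walk the output 16 columns at a time; 'width' is capped at the columns remaining in the final tile.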
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+            const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
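+            // Each case below is a hand-scheduled variant for the number of output rows remaining (1 to 4).
+            // The sdot instructions appear as raw ".word" encodings (with the decoded mnemonic as a comment), which lets the
+            // file assemble even with toolchains that do not recognise the ARMv8.2 dot-product extension.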
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
new file mode 100644
index 0000000..7fb9b5c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+class hybrid_u8u32_dot_16x4
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+
+ hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) {
+ kernel = a64_hybrid_u8u32_dot_16x4_a55;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
new file mode 100644
index 0000000..230ecdc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ case 3:
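+ // Three-row variant: rows 1-2 of A/C are addressed via a_ptr1/a_ptr2, with the accumulators in v16-v27.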
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
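+ // Default case, four rows: a_ptr1-a_ptr3 address the remaining A rows and the full v16-v31 accumulator set is used.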
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d3, [a_ptr3, #0x10]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x18]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
new file mode 100644
index 0000000..dbef029
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
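+// Hybrid GEMM kernel for uint8 inputs accumulating into uint32, working on blocks of up to
+// 4 rows of A and 16 columns of B per outer iteration. The dot-product steps are emitted as raw
+// ".word" encodings of UDOT (each annotated with its mnemonic in a comment), presumably so the
+// file still assembles with toolchains that do not accept the dot-product mnemonics directly.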
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
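+    // Blocking parameters: beta0 flags the beta == 0 case, in which the accumulators are
+    // zero-initialised instead of scaling the existing C values. K_stride rounds K up to a
+    // multiple of 4 (the group size consumed by each UDOT lane) and is the per-column stride
+    // into the packed B panel. The main loop consumes 32 values of K per iteration; "regs"
+    // then selects between a one-block and a two-block (16 vs. 32 value) tail after the loop.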
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+            const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
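+            // Dispatch on the number of rows left in this block: dedicated specialisations for
+            // 1, 2 and 3 rows avoid touching out-of-range rows of A and C; the default case
+            // handles a full block of 4.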
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
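+ // 2: main loop done - prefetch the output rows, then handle any remaining blocks.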
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
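+ // 4: no extra block left (%[regs] == 0) - only the data already loaded in v0-v3 remains.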
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
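+ // 5: store the 4x16 block of accumulators back to C.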
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 10d1069..3c0395a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,15 +51,15 @@
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
new file mode 100644
index 0000000..95e3712
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+// Native A/Pretranspose B SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_nativeA_pretransposeB_16x4 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+ /* Desired data layout for B buffer (used for pretranspose) */
+ static const int B_interleave = 16;
+ static const int B_block = 1;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static unsigned int out_width() {
+ return 16;
+ }
+
+ static unsigned int out_height() {
+ return 4;
+ }
+
+ static unsigned int k_unroll() {
+ return 1;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4;
+
+ sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
new file mode 100644
index 0000000..b2516f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
@@ -0,0 +1,970 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+#include <arm_neon.h>
+
+namespace arm_gemm {
+
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) {
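+ // "oddk" means the detached final iteration covers only 4 K values (not 8);
+ // "oddones" is the number of leftover K values (K % 4) handled one at a time.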
+ const bool oddk = ((K % 8) >= 4);
+ const bool beta0 = (beta == 0.0f);
+ const unsigned int oddones = (K % 4);
+
+ /* Use some small temporary arrays to cope with "ragged" M/N sizes.
+ *
+ * "dummy_A_buf" is used to avoid overreading the A input for ragged M,
+ * and also as a dummy output target for the unused rows when N is not ragged.
+ *
+ * Since the B input is pretransposed it will be padded as needed, so no
+ * need to worry about overreading that.
+ *
+ * "C_buf" is used to avoid overreading or overwriting the output for
+ * ragged N cases.
+ */
+ float dummy_A_buf[16];
+ float C_buf[64];
+
+ std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf));
+ std::memset(C_buf, 0, sizeof(C_buf));
+
+ for (unsigned int y=0; y<numrows; y+=4) {
+ const float *b_ptr = B_panel;
+ const unsigned int active_rows = std::min(numrows - y, 4U);
+
+ /* Increment values to be used to advance A pointers - these get set
+ * to zero when the corresponding row isn't being used due to ragged
+ * M, so it will just read the dummy buffer repeatedly. Values are
+ * in bytes (8x sizeof(float)). */
+ const unsigned long a_incr1 = (active_rows > 1) ? 32 : 0;
+ const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0;
+ const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0;
+
+ /* Starting points for A pointers on this loop */
+ const float * const a_ptr0_base = A + (y * lda);
+ const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf;
+ const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf;
+ const float * const a_ptr3_base = (active_rows > 3) ? (a_ptr2_base + lda) : dummy_A_buf;
+
+ /* Starting points for C pointers on this loop */
+ float *c_ptr0 = C + (y * ldc);
+ float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf;
+ float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf;
+ float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf;
+
+ for (unsigned int x0=0; x0<numcols; x0+=16) {
+ const unsigned int active_cols = std::min(numcols - x0, 16U);
+ const bool use_result_buf = (active_cols < 16);
+
+ /* Reset the A pointers for this loop. */
+ const float *a_ptr0 = a_ptr0_base;
+ const float *a_ptr1 = a_ptr1_base;
+ const float *a_ptr2 = a_ptr2_base;
+ const float *a_ptr3 = a_ptr3_base;
+
+ /* Override C pointers if the result buffer is in use. */
+ if (use_result_buf) {
+ c_ptr0 = C_buf;
+ c_ptr1 = C_buf + 16;
+ c_ptr2 = C_buf + 32;
+ c_ptr3 = C_buf + 48;
+
+ /* If beta is non-zero, prepopulate the result buffer */
+ if (!beta0) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C_buf[row * 16 + col] = C[((y + row) * ldc) + (x0 + col)];
+ }
+ }
+ }
+ }
+
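+ // Number of full main-loop iterations (8 K values each), excluding the detached final iteration.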
+ unsigned int loops = ((K+4)/8) - 1;
+ unsigned int odds = oddones;
+
+ __asm __volatile (
+ "a0 .req v0\n"
+ "a1 .req v1\n"
+ "a2 .req v2\n"
+ "a3 .req v3\n"
+ "a0a .req v4\n"
+ "a1a .req v5\n"
+ "a2a .req v6\n"
+ "a3a .req v7\n"
+ "bb0 .req v8\n"
+ "bb1 .req v9\n"
+ "bb2 .req v10\n"
+ "bb3 .req v11\n"
+ "b0a .req v12\n"
+ "b1a .req v13\n"
+ "b2a .req v14\n"
+ "b3a .req v15\n"
+
+ "a0q .req q0\n"
+ "a1q .req q1\n"
+ "a2q .req q2\n"
+ "a3q .req q3\n"
+ "a0aq .req q4\n"
+ "a1aq .req q5\n"
+ "a2aq .req q6\n"
+ "a3aq .req q7\n"
+ "b0q .req q8\n"
+ "b1q .req q9\n"
+ "b2q .req q10\n"
+ "b3q .req q11\n"
+ "b0aq .req q12\n"
+ "b1aq .req q13\n"
+ "b2aq .req q14\n"
+ "b3aq .req q15\n"
+
+ "movi v16.4s, #0x0\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "movi v21.4s, #0x0\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+ "movi v25.4s, #0x0\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+ "cbz %w[beta0], 5f\n"
+ "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "movi v28.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "movi v29.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "movi v31.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ // Skip if no complete loops.
+ "cbz %w[loops], 4f\n"
+ "b 1f\n"
+
+ // If beta is non-zero, need to load and multiply by beta
+ "5:\n"
+ "ld1r {v4.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #16]\n"
+ "ldr q18, [%[c_ptr0], #32]\n"
+ "ldr q19, [%[c_ptr0], #48]\n"
+
+ "ldr q20, [%[c_ptr1]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "ldr q21, [%[c_ptr1], #16]\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "ldr q22, [%[c_ptr1], #32]\n"
+ "fmul v18.4s, v18.4s, v4.4s\n"
+ "ldr q23, [%[c_ptr1], #48]\n"
+ "fmul v19.4s, v19.4s, v4.4s\n"
+
+ "ldr q24, [%[c_ptr2]]\n"
+ "fmul v20.4s, v20.4s, v4.4s\n"
+ "ldr q25, [%[c_ptr2], #16]\n"
+ "fmul v21.4s, v21.4s, v4.4s\n"
+ "ldr q26, [%[c_ptr2], #32]\n"
+ "fmul v22.4s, v22.4s, v4.4s\n"
+ "ldr q27, [%[c_ptr2], #48]\n"
+ "fmul v23.4s, v23.4s, v4.4s\n"
+
+ "ldr q28, [%[c_ptr3]]\n"
+ "fmul v24.4s, v24.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "ldr q29, [%[c_ptr3], #16]\n"
+ "fmul v25.4s, v25.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "ldr q30, [%[c_ptr3], #32]\n"
+ "fmul v26.4s, v26.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "ldr q31, [%[c_ptr3], #48]\n"
+ "fmul v27.4s, v27.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+
+ "fmul v28.4s, v28.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmul v29.4s, v29.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmul v30.4s, v30.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmul v31.4s, v31.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ "cbz %w[loops], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x240]")
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr], #64]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #80]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #96]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #112]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x280]")
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #128]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #144]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #160]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #176]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x2C0]")
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr], #192]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #208]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr0], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #224]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr1], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #240]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #512\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr], #-256]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #-240]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ ASM_PREFETCH("[%[a_ptr2], #0x40]")
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #-224]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #-208]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr], #-192]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #-176]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #-160]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #-144]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #-128]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[a_ptr3], #0x40]")
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #-112]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #-96]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #-80]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+ "ldr b0q, [%[b_ptr], #-64]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+ "ldr b1q, [%[b_ptr], #-48]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+ "ldr b2q, [%[b_ptr], #-32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+ "ldr b3q, [%[b_ptr], #-16]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "bne 1b\n"
+
+ // Skip to here
+ "4:\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "cbnz %w[odds], 6f\n"
+
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // Odd K case: Just do 4 more.
+ "2:\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #16\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "add %[a_ptr1], %[a_ptr1], #16\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "add %[a_ptr2], %[a_ptr2], #16\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "add %[a_ptr3], %[a_ptr3], #16\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "cbnz %w[odds], 7f\n"
+
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // "Odd ones" - lead in from even
+ "6:\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+ "b 8f\n"
+
+ // "Odd ones" - lead in from odd
+ "7:\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+
+ // "Odd ones" - loop
+ "8:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "bne 8b\n"
+
+ // "Odd ones" - detached final iteration
+ "9:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "str q27, [%[c_ptr2], #48]\n"
+
+ "3:\n"
+ "str q28, [%[c_ptr3]]\n"
+ // Increment C pointers for next loop - this looks odd if we
+ // are using the result buffer, but it's OK as using the
+ // result buffer implies there will be no next loop.
+ "add %[c_ptr0], %[c_ptr0], #64\n"
+ "str q29, [%[c_ptr3], #16]\n"
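+ // a_incrN is 32 (or 0 for an unused row); LSL #1 gives the 64-byte (16 float) output row stride.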
+ "add %[c_ptr1], %[c_ptr1], %[a_incr1], LSL #1\n"
+ "str q30, [%[c_ptr3], #32]\n"
+ "add %[c_ptr2], %[c_ptr2], %[a_incr2], LSL #1\n"
+ "str q31, [%[c_ptr3], #48]\n"
+ "add %[c_ptr3], %[c_ptr3], %[a_incr3], LSL #1\n"
+
+ : [a_ptr0] "+r" (a_ptr0), [a_ptr1] "+r" (a_ptr1), [a_ptr2] "+r" (a_ptr2), [a_ptr3] "+r" (a_ptr3),
+ [b_ptr] "+r" (b_ptr), [loops] "+r" (loops), [odds] "+r" (odds),
+ [c_ptr0] "+r" (c_ptr0), [c_ptr1] "+r" (c_ptr1), [c_ptr2] "+r" (c_ptr2), [c_ptr3] "+r" (c_ptr3)
+ : [oddk] "r" (oddk), [beta0] "r" (beta0), [betaptr] "r" (&beta),
+ [a_incr1] "r" (a_incr1), [a_incr2] "r" (a_incr2), [a_incr3] "r" (a_incr3)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+ "cc", "memory"
+ );
+
+ /* Copy results from result buffer if needed. */
+ if (use_result_buf) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C[((y + row) * ldc) + (x0 + col)] = C_buf[row * 16 + col];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
index 1a35965..3d2b324 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,15 +46,15 @@
typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 16;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index a73bc76..f5b4f4a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,13 +46,26 @@
* terms of this standard arrangement, so if the A matrix is in fact the
* B matrix from a GEMM call, the sense of the transpose needs to be
* reversed. */
- static const int A_interleave = 32;
- static const int A_block = 1;
- static const bool A_transpose = false;
+ static constexpr unsigned int A_interleave() {
+ return 32;
+ }
+
+ static constexpr unsigned int A_block() {
+ return 1;
+ }
+
+ static constexpr bool A_transpose() {
+ return false;
+ }
/* Kernel blocking parameters */
- static const int out_width = 32;
- static const int k_unroll = 1;
+ static constexpr unsigned int out_width() {
+ return 32;
+ }
+
+ static constexpr unsigned int k_unroll() {
+ return 1;
+ }
kern_type kernel = a64_sgemv_pretransposed;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 18c5c3a..cbaa0cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,13 @@
typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
/* Kernel blocking parameters */
- static const int out_width = 96;
- static const int k_unroll = 1;
+ static unsigned int out_width() {
+ return 96;
+ }
+
+ static unsigned int k_unroll() {
+ return 1;
+ }
kern_type kernel=a64_sgemv_trans;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000..76f452d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_4VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
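+    // "4VLx4": the output tile is four SVE vectors of floats wide and four
+    // rows tall, so out_width() scales with the hardware vector length while
+    // out_height() is fixed at 4.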
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
+
+ hybrid_fp32_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..b8aa825
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2005 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
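+    // beta0 selects the accumulator setup in the assembly: when beta == 0 the
+    // accumulators are zero-initialised, otherwise C is loaded and pre-scaled
+    // by beta before accumulation.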
+ const long beta0 = (beta == 0.0f);
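+    // K is split ahead of the assembly: the main loop consumes 8 values of K
+    // per iteration, the tail consumes a further (regs_count + 1) * 4 values,
+    // and the final 0-3 "leftovers" are handled under predicate p6.
+    // For example, K = 13 gives loops_count = 1, regs_count = 0, leftovers = 1.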
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
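+            // The assembly sets up p0-p3 as whilelt predicates against width,
+            // masking each of the four vector-wide column strips so the final
+            // x-block never reads or writes past N.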
+            const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+
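+            // Dispatch on the number of rows left in this block: the 1-, 2-
+            // and 3-row cases are separate specialisations so the final block
+            // of an M that is not a multiple of 4 never touches rows past M.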
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 3fd738e..2ca4ce2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,17 +41,17 @@
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
- return svcnth() * 3;
+ return get_vector_length<__fp16>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index 92ec888..517895c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,24 +48,24 @@
"mov z8.h, #0\n"
"ptrue p0.h\n"
"mov z9.h, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z10.h, #0\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z11.h, #0\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z12.h, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z13.h, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z14.h, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z15.h, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z16.h, #0\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
+ "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
"mov z17.h, #0\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
"mov z18.h, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
"mov z19.h, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
"mov z20.h, #0\n"
"mov z21.h, #0\n"
"mov z22.h, #0\n"
@@ -199,37 +199,31 @@
"fmla z30.h, z7.h, z1.h[6]\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"fmla z8.h, z2.h, z0.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z2.h, z0.h[1]\n"
"fmla z10.h, z2.h, z0.h[2]\n"
"fmla z11.h, z2.h, z0.h[3]\n"
"fmla z12.h, z2.h, z0.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z2.h, z0.h[5]\n"
"fmla z14.h, z2.h, z0.h[6]\n"
"fmla z15.h, z2.h, z0.h[7]\n"
"fmla z16.h, z3.h, z0.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z3.h, z0.h[1]\n"
"fmla z18.h, z3.h, z0.h[2]\n"
"fmla z19.h, z3.h, z0.h[3]\n"
"fmla z20.h, z3.h, z0.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z3.h, z0.h[5]\n"
"fmla z22.h, z3.h, z0.h[6]\n"
"fmla z23.h, z3.h, z0.h[7]\n"
"fmla z24.h, z4.h, z0.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z4.h, z0.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z4.h, z0.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z4.h, z0.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z4.h, z0.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z4.h, z0.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z4.h, z0.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"b 4f\n"
"3:\n"
@@ -260,39 +254,39 @@
"fmla z30.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"fmla z8.h, z5.h, z1.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z5.h, z1.h[1]\n"
"fmla z10.h, z5.h, z1.h[2]\n"
"fmla z11.h, z5.h, z1.h[3]\n"
"fmla z12.h, z5.h, z1.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z5.h, z1.h[5]\n"
"fmla z14.h, z5.h, z1.h[6]\n"
"fmla z15.h, z5.h, z1.h[7]\n"
"fmla z16.h, z6.h, z1.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z1.h[2]\n"
"fmla z19.h, z6.h, z1.h[3]\n"
"fmla z20.h, z6.h, z1.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z6.h, z1.h[5]\n"
"fmla z22.h, z6.h, z1.h[6]\n"
"fmla z23.h, z6.h, z1.h[7]\n"
"fmla z24.h, z7.h, z1.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z7.h, z1.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z7.h, z1.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z7.h, z1.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z7.h, z1.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z7.h, z1.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z7.h, z1.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"4:\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index b2327f3..8c1fe6d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,17 +41,17 @@
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<float>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index bb08fc7..88c9840 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,22 +48,22 @@
"mov z8.s, #0\n"
"ptrue p0.s\n"
"mov z9.s, #0\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -204,37 +204,31 @@
"fmla z31.s, z6.s, z3.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z0.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z0.s[1]\n"
"fmla z10.s, z4.s, z0.s[2]\n"
"fmla z11.s, z4.s, z0.s[3]\n"
"fmla z20.s, z4.s, z1.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z1.s[1]\n"
"fmla z22.s, z4.s, z1.s[2]\n"
"fmla z23.s, z4.s, z1.s[3]\n"
"fmla z12.s, z5.s, z0.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z0.s[1]\n"
"fmla z14.s, z5.s, z0.s[2]\n"
"fmla z15.s, z5.s, z0.s[3]\n"
"fmla z24.s, z5.s, z1.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z1.s[1]\n"
"fmla z26.s, z5.s, z1.s[2]\n"
"fmla z27.s, z5.s, z1.s[3]\n"
"fmla z16.s, z6.s, z0.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z0.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z0.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z0.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z1.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z1.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z1.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"b 4f\n"
"3:\n"
@@ -269,39 +263,39 @@
"fmla z31.s, z6.s, z1.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z2.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z2.s[1]\n"
"fmla z10.s, z4.s, z2.s[2]\n"
"fmla z11.s, z4.s, z2.s[3]\n"
"fmla z20.s, z4.s, z3.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z3.s[1]\n"
"fmla z22.s, z4.s, z3.s[2]\n"
"fmla z23.s, z4.s, z3.s[3]\n"
"fmla z12.s, z5.s, z2.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z2.s[1]\n"
"fmla z14.s, z5.s, z2.s[2]\n"
"fmla z15.s, z5.s, z2.s[3]\n"
"fmla z24.s, z5.s, z3.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z3.s[1]\n"
"fmla z26.s, z5.s, z3.s[2]\n"
"fmla z27.s, z5.s, z3.s[3]\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z2.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z2.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z3.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z3.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z3.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z3.s[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index 91aa567..cbb2138 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,17 +41,17 @@
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<int32_t>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 2e994a1..d679c21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,22 +49,22 @@
"mov z8.s, #0\n"
"ptrue p0.b\n"
"mov z9.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -205,37 +205,31 @@
"sdot z31.s, z6.b, z3.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z0.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z0.b[1]\n"
"sdot z10.s, z4.b, z0.b[2]\n"
"sdot z11.s, z4.b, z0.b[3]\n"
"sdot z20.s, z4.b, z1.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z1.b[1]\n"
"sdot z22.s, z4.b, z1.b[2]\n"
"sdot z23.s, z4.b, z1.b[3]\n"
"sdot z12.s, z5.b, z0.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z0.b[1]\n"
"sdot z14.s, z5.b, z0.b[2]\n"
"sdot z15.s, z5.b, z0.b[3]\n"
"sdot z24.s, z5.b, z1.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z1.b[1]\n"
"sdot z26.s, z5.b, z1.b[2]\n"
"sdot z27.s, z5.b, z1.b[3]\n"
"sdot z16.s, z6.b, z0.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z1.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z1.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z1.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -270,39 +264,39 @@
"sdot z31.s, z6.b, z1.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z2.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z2.b[1]\n"
"sdot z10.s, z4.b, z2.b[2]\n"
"sdot z11.s, z4.b, z2.b[3]\n"
"sdot z20.s, z4.b, z3.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z3.b[1]\n"
"sdot z22.s, z4.b, z3.b[2]\n"
"sdot z23.s, z4.b, z3.b[3]\n"
"sdot z12.s, z5.b, z2.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z2.b[1]\n"
"sdot z14.s, z5.b, z2.b[2]\n"
"sdot z15.s, z5.b, z2.b[3]\n"
"sdot z24.s, z5.b, z3.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z3.b[1]\n"
"sdot z26.s, z5.b, z3.b[2]\n"
"sdot z27.s, z5.b, z3.b[3]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z3.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z3.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z3.b[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index ef457e4..99c039e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,17 +41,17 @@
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<uint32_t>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000..d7f9f20
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class native_fp32_mla_4VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_fp32_mla_4VLx4;
+
+ native_fp32_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
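
For context only (not part of the patch): the blocking parameters declared in the header above describe how this native kernel tiles the output matrix, handling out_height() rows and out_width() columns of C per invocation, where out_width() is four SVE vector lengths of float. Below is a minimal C++ sketch of that tiling, mirroring the M/N loops in the generic.cpp added next; the Strategy parameter and the for_each_block helper are illustrative assumptions, not arm_gemm API.

    #include <algorithm>

    template <typename Strategy>
    void for_each_block(unsigned int M, unsigned int N) {
        // Rows and columns of C handled per kernel invocation.
        const unsigned int bh = Strategy::out_height(); // 4 for this kernel
        const unsigned int bw = Strategy::out_width();  // 4 * SVE vector length, in floats
        for (unsigned int y = 0; y < M; y += bh) {
            for (unsigned int x = 0; x < N; x += bw) {
                // The last block in each direction may be partial; the SVE kernel
                // copes with this via predicated loads/stores (the whilelt predicates
                // set up at the top of each asm block).
                const unsigned int h = std::min(bh, M - y);
                const unsigned int w = std::min(bw, N - x);
                // ... invoke the strategy's kernel on the h x w tile of C ...
                (void)h; (void)w;
            }
        }
    }
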
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..6e22566
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2066 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+      const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + x0;
+ long ldbb = ldb * sizeof(float);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "2:\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z26.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z27.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z28.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
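
Editor's note: the machine-generated SVE assembly above implements the "native" (non-pretransposed B) FP32 GEMM inner kernel. When the beta0 flag is set, the accumulators start from zero; otherwise the existing C tile is loaded and scaled by beta, and the predicated ld1w/fmla sequence then accumulates A*B for up to four rows of A per pass, with predicates p0..p3 masking the partial vectors at the right edge. The sketch below is a plain scalar reference model of that computation, not part of the patch; the function name, signature and row-major layout are illustrative assumptions only.

    // Hypothetical scalar reference (not part of the library) for what the
    // SVE kernel above computes on one M x N block:
    //   C = beta * C + A * B   (the beta == 0 path never reads C)
    #include <cstddef>

    void reference_native_gemm_f32(const float *A, size_t lda,
                                   const float *B, size_t ldb,
                                   float *C, size_t ldc,
                                   size_t M, size_t N, size_t K, float beta)
    {
        for (size_t m = 0; m < M; m++)
        {
            for (size_t n = 0; n < N; n++)
            {
                // Start from zero when beta is zero, otherwise scale the existing output.
                float acc = (beta == 0.0f) ? 0.0f : beta * C[m * ldc + n];
                for (size_t k = 0; k < K; k++)
                {
                    acc += A[m * lda + k] * B[k * ldb + n];
                }
                C[m * ldc + n] = acc;
            }
        }
    }
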
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000..8b98358
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+class native_s8s32_dot_4VLx4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
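+ // out_width() is four SVE vectors of int32 (the "4VL" in the kernel name): with
+ // 256-bit vectors get_vector_length<int32_t>() == 8, so each call produces a 4 x 32 tile.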
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
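+ // k_unroll() is 4 because each SDOT instruction consumes four int8 values per 32-bit accumulator lane.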
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_s8s32_dot_4VLx4;
+
+ native_s8s32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
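
For orientation: the generated implementation that follows computes the plain GEMM
update C = beta * C + A * B on int8 operands with int32 accumulation, reading B
directly in its native layout rather than from a pre-packed buffer. The scalar
sketch below shows the same semantics; it is illustrative only (the helper and its
name do not appear in the patch) and uses the pointer/stride conventions visible in
the loop setup of the generated code.

#include <cstdint>

// Scalar reference for the kernel's semantics (sketch, not part of the patch).
// beta == 0 means C is overwritten; otherwise the existing C is scaled by beta
// before the A*B products are accumulated, matching the beta0 path in the asm.
static void native_s8s32_reference(const int8_t *A, int lda, const int8_t *B, int ldb,
                                   int32_t *C, int ldc, int32_t beta, int M, int N, int K)
{
    for (int y = 0; y < M; y++) {
        for (int x = 0; x < N; x++) {
            int32_t acc = (beta == 0) ? 0 : beta * C[y * ldc + x];
            for (int k = 0; k < K; k++) {
                acc += static_cast<int32_t>(A[y * lda + k]) * static_cast<int32_t>(B[k * ldb + x]);
            }
            C[y * ldc + x] = acc;
        }
    }
}

Each __asm block in the generated file handles one M-remainder case (1 to 4 rows) of
this computation, with the running accumulators held in z16 upward.
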
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..9c02d95
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
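+ // K is peeled as follows: the main loop consumes 32 int8 values per iteration,
+ // 'regs' selects whether one or two further 16-value chunks remain after it,
+ // 'blocks' counts the remaining groups of 4 (one SDOT each), and 'odds' the
+ // final 1-3 values, for which the missing B rows are zero-filled.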
+ const long beta0 = (beta == 0);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + x0;
+ const int8_t *b_ptr1 = b_ptr0 + ldb;
+ const int8_t *b_ptr2 = b_ptr1 + ldb;
+ const int8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(int8_t) * 4;
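+ // B is read in its native layout: b_ptr0..b_ptr3 walk four consecutive K-rows of B,
+ // ldbb (bound to %[ldb] inside the asm) steps each pointer forward by four rows, and
+ // the ZIP instructions interleave those rows so that every SDOT lane group holds four
+ // consecutive K values of a single output column.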
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
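+ // Two A rows in flight: the second row's A and C pointers live in x0/x1 via the
+ // a_ptr1/c_ptr1 .req aliases declared at the top of the asm block.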
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
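+ // case 3: M-tail of three rows - A rows are loaded into z0-z2 and the output tile is held in accumulators z16-z27 (3 rows x 4 vectors of C).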
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
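+ // default/case 4: full four-row tile - A rows are loaded into z0-z3 and the output tile is held in accumulators z16-z31 (4 rows x 4 vectors of C).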
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000..bcbd3d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
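+// (Argument order is A, lda, B, ldb, C, ldc, beta, M, N, K, matching the definition in generic.cpp.)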
+void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
+
+class native_u8u32_dot_4VLx4
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_u8u32_dot_4VLx4;
+
+ native_u8u32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
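(Reference note, not part of the patch: the interface above is easiest to read against a scalar model of the kernel's contract. The sketch below assumes row-major A, B and C with leading dimensions given in elements; the helper name reference_u8u32_gemm is hypothetical and only illustrates what the generated kernel computes.)

#include <cstdint>

// Minimal scalar sketch of the contract of sve_native_u8u32_dot_4VLx4:
// C = beta * C + A * B, with uint8 inputs accumulated into uint32.
// When beta == 0 the existing contents of C are ignored.
static void reference_u8u32_gemm(const uint8_t *A, int lda, const uint8_t *B, int ldb,
                                 uint32_t *C, int ldc, uint32_t beta, int M, int N, int K)
{
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            uint32_t acc = (beta == 0u) ? 0u : beta * C[m * ldc + n];
            for (int k = 0; k < K; k++) {
                acc += static_cast<uint32_t>(A[m * lda + k]) * static_cast<uint32_t>(B[k * ldb + n]);
            }
            C[m * ldc + n] = acc;
        }
    }
}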
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..7d89948
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
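+ // The driver below splits K into unrolled 32-element main-loop iterations ("loops"),
+ // an optional extra 16-element pass ("regs"), and a sub-16 tail processed as 4-element
+ // dot-product "blocks" plus up to three leftover "odds" elements.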
+ const long beta0 = (beta == 0u);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + x0;
+ const uint8_t *b_ptr1 = b_ptr0 + ldb;
+ const uint8_t *b_ptr2 = b_ptr1 + ldb;
+ const uint8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(uint8_t) * 4;
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
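+ // Two-row variant of the same udot kernel: a_ptr1/c_ptr1 are derived from the base
+ // pointers by adding lda/ldc, the second row's partial sums accumulate into z20-z23,
+ // and both rows of 32-bit results are stored via c_ptr0 and c_ptr1 at label 9.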
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
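+ // Three-row variant: a_ptr1/a_ptr2 and c_ptr1/c_ptr2 address the second and third
+ // rows (offset by lda/ldc), with accumulators z20-z23 and z24-z27 respectively; the
+ // interleaved B loads and zip/udot structure mirror the one- and two-row cases above.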
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
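[editor's note, not part of the patch] The interleaved udot kernel above accumulates four unsigned 8-bit products into each 32-bit lane of the z16–z31 accumulators. As a minimal scalar sketch of that per-lane accumulation (function name and loop are illustrative only, not the library's API):

```cpp
// Illustrative only: scalar model of one UDOT accumulation step.
// Each 32-bit accumulator lane gains the sum of four u8*u8 products,
// mirroring "udot zAcc.s, zB.b, zA.b[idx]" in the kernel above.
#include <cstdint>

static inline uint32_t udot_lane_scalar(uint32_t acc,
                                        const uint8_t b[4],
                                        const uint8_t a[4])
{
    for (int i = 0; i < 4; ++i)
    {
        acc += static_cast<uint32_t>(b[i]) * static_cast<uint32_t>(a[i]);
    }
    return acc;
}
```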
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000..06622d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class smallK_fp32_mla_1VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_smallK_fp32_mla_1VLx4;
+
+ smallK_fp32_mla_1VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
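[editor's note, not part of the patch] The entry point declared in this header takes its operands in (A, lda, B, ldb, C, ldc, beta, M, N, K) order, matching the definition in generic.cpp below. In the library the kernel is normally dispatched through the arm_gemm selection machinery; the sketch below shows a hypothetical direct call under assumed row-major operand sizes, with K within the depth range this smallK kernel implements.

```cpp
// Illustrative sketch only (not part of the patch): calling the smallK
// FP32 kernel directly. Buffer shapes and values are assumptions.
#include <vector>

// Declared in sve_smallK_fp32_mla_1VLx4.hpp (SVE builds only).
namespace arm_gemm {
void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int, float *, int, float, int, int, int);
}

void example_call()
{
    const int M = 6, N = 128, K = 4; // K assumed within the kernel's small-K range
    std::vector<float> A(M * K), B(K * N), C(M * N);
    // beta == 0.0f makes the kernel start from zeroed accumulators;
    // a non-zero beta makes it load and accumulate into the existing C.
    arm_gemm::sve_smallK_fp32_mla_1VLx4(A.data(), K, B.data(), N, C.data(), N, 0.0f, M, N, K);
}
```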
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000..e2cc1d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4264 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const long odd_depth = K % 4;
+  const float *betaptr = &beta;
+ long ldbb = ldb * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + x0;
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
+ switch(K) {
+ case 1:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 12:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 13:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 14:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 15:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 16:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 17:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 18:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 19:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 20:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 21:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 22:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 23:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ default:
+ case 24:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z27.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
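Each K-specialised case in the switch above follows the same scheme: one SVE vector of B is preloaded per depth value (z4 upwards, reaching z27 in the K=24/default case), the kernel then walks A four rows at a time, broadcasting each A element into an indexed fmla against the preloaded B vectors, and stores one predicated vector of C per row; rows left over from the blocks of four are handled one at a time in the odd-row tail. The cases shown either zero the accumulators when beta is zero or accumulate onto the existing C values. A scalar reference of the computation each case performs, as a minimal sketch (strides here are in elements, whereas the lda/ldb/ldc operands in the assembly carry byte strides):

static void smallk_strip_reference(const float *a, long lda,
                                   const float *b, long ldb,
                                   float *c, long ldc,
                                   long rows, long width, long K, bool beta0)
{
    for (long r = 0; r < rows; r++)                      // blocks of 4 rows, then the odd-row tail
    {
        for (long n = 0; n < width; n++)                 // one predicated SVE vector of columns
        {
            float acc = beta0 ? 0.0f : c[r * ldc + n];   // "mov z28.s, #0" vs "ld1w z28.s, p0/z"
            for (long k = 0; k < K; k++)
            {
                acc += b[k * ldb + n] * a[r * lda + k];  // the indexed fmla chains against z4..z27
            }
            c[r * ldc + n] = acc;                        // "st1w z28.s, p0, [...]"
        }
    }
}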
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000..022efdf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class smallK_hybrid_fp32_mla_1VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 1, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4;
+
+ smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
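The class is a thin descriptor: it publishes the blocking parameters (four output rows per pass, one SVE vector of float columns, no extra K unrolling) and a kern_type pointer that defaults to the generic implementation below. A minimal dispatch sketch follows; B_packed is assumed to already be in the rearranged layout the kernel walks (the generic.cpp that follows indexes it as B + K_stride * x0, which suggests K_stride-deep column strips rather than a raw row-major K x N operand — an assumption about packing done elsewhere):

#include "sve_smallK_hybrid_fp32_mla_1VLx4.hpp"

// Hypothetical wrapper around the descriptor above; the argument order follows
// the kernel definition in generic.cpp (A, lda, B, C, ldc, beta, M, N, K).
void run_smallk_fp32(const CPUInfo *ci,
                     const float *A, int lda,
                     const float *B_packed,
                     float *C, int ldc,
                     float beta, int M, int N, int K)
{
    arm_gemm::smallK_hybrid_fp32_mla_1VLx4 strat(ci);    // constructor currently ignores ci
    strat.kernel(A, lda, B_packed, C, ldc, beta, M, N, K);
}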
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000..3e7e713
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4004 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const int K_stride = K;
+ const long odd_depth = K % 4;
+ const float *betaptr = β
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + (K_stride * x0);
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
+ switch(K) {
+ case 1:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 12:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 13:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 14:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
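+ // 15-step variant: B columns stay resident in z4-z18; four A/C rows are processed per main-loop iteration (accumulators z28-z31) with a single-row tail loop for leftover rows, and each A row is three full quads plus a p6-predicated tail quad at #0x30 (three leftover elements).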
+ case 15:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
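+ // 16-step variant: B columns resident in z4-z19; A rows are exactly four full quads, so the odd-depth predicate p6 is unused in the body.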
+ case 16:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
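+ // 17-step variant: B columns resident in z4-z20; a fifth A quad at #0x40 is loaded under p6 for the single leftover element.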
+ case 17:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
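+ // 18-step variant: B columns resident in z4-z21; the fifth A quad at #0x40 is loaded under p6 (two leftover elements).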
+ case 18:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
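+ // 19-step variant: B columns resident in z4-z22; the fifth A quad at #0x40 is loaded under p6 (three leftover elements).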
+ case 19:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
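+ // 20-step variant: B columns resident in z4-z23; A rows are exactly five full quads, so p6 is unused in the body.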
+ case 20:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
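+ // 21-step variant: B columns resident in z4-z24; a sixth A quad at #0x50 is loaded under p6 for the single leftover element.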
+ case 21:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
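+ // 22-step variant: B columns resident in z4-z25; the sixth A quad at #0x50 is loaded under p6 (two leftover elements).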
+ case 22:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 23:
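+                /* A sketch of what this generated block does: it keeps 23 B-panel vectors (z4-z26) resident across the loop; the main loop at "2:" accumulates four C rows per iteration, and the tail loop at "6:" handles any leftover single rows. */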
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ default:
+ case 24:
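+                /* Full-width case (also the default): all 24 B-panel vectors (z4-z27) stay in registers; otherwise the structure matches the cases above (four-row main loop at "2:", single-row tail at "6:"). */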
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
new file mode 100644
index 0000000..fcdca59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template<>
+inline void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
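+    /* The input buffer is pre-arranged in 8x12 tiles of 96 consecutive floats: row r of a tile starts at inptr[12 * r]. Each output element receives alpha * in, plus beta * out when beta is non-zero. */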
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ const int height = ymax - y;
+
+ for (int i=x0; i<xmax; i+=12) {
+ if (beta==0.0f)
+ {
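+                /* beta == 0: the existing contents of the output are ignored, so these paths only scale by alpha and store, never loading C. */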
+ switch(height) {
+ case 1:
+ {
+ if ((i+11) >= xmax)
+ {
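+                    /* Fewer than 12 columns remain in this block: fall back to the scalar per-element loop. inptr still advances by a full 96-float tile afterwards. */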
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 2:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 3:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 4:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 5:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q6, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr1], #0x10]\n"
+ "ldr q7, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr2], #0x10]\n"
+ "ldr q4, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr3], #0x10]\n"
+ "ldr q5, [%[inptr], #0xd0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr4], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 6:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q4, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x10]\n"
+ "ldr q5, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x10]\n"
+ "ldr q6, [%[inptr], #0xd0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x10]\n"
+ "ldr q7, [%[inptr], #0x100]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 7:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]);
+ outptr6++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q6, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr3], #0x10]\n"
+ "ldr q7, [%[inptr], #0xd0]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr4], #0x10]\n"
+ "ldr q4, [%[inptr], #0x100]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr5], #0x10]\n"
+ "ldr q5, [%[inptr], #0x130]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr6], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "ldr q7, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x20]\n"
+ "ldr q4, [%[inptr], #0x140]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr6], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q7, [%[inptr], #0x150]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7]]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q4, [%[inptr], #0xd0]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x10]\n"
+ "ldr q5, [%[inptr], #0x100]\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x10]\n"
+ "ldr q6, [%[inptr], #0x130]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x10]\n"
+ "ldr q7, [%[inptr], #0x160]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "ldr q6, [%[inptr], #0x140]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x20]\n"
+ "ldr q7, [%[inptr], #0x170]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x20]\n"
+ "add %[outptr7], %[outptr7], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+
+ }
+ }
+ else
+ {
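+                /* beta != 0: each output vector is loaded, scaled by beta with fmul, then alpha * in is accumulated on top with fmla before the store. */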
+ switch(height) {
+ case 1:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 2:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q11, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 3:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q8, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q9, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 4:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q9, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q10, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q11, [%[outptr3], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 5:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q10, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x40]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr1], #0x10]\n"
+ "ldr q11, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x70]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr2], #0x10]\n"
+ "ldr q8, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xa0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr3], #0x10]\n"
+ "ldr q9, [%[outptr4], #0x10]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xd0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr4], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q9, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q10, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 6:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q11, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q8, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x70]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x10]\n"
+ "ldr q9, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xa0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x10]\n"
+ "ldr q10, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xd0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x10]\n"
+ "ldr q11, [%[outptr5], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x100]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q8, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q9, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 7:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr6]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q11, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q8, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q9, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q10, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xa0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr3], #0x10]\n"
+ "ldr q11, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xd0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr4], #0x10]\n"
+ "ldr q8, [%[outptr5], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x100]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr5], #0x10]\n"
+ "ldr q9, [%[outptr6], #0x10]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x130]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr6], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q9, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q10, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "ldr q11, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x110]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x20]\n"
+ "ldr q8, [%[outptr6], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x140]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr6], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr6]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q11, [%[outptr7]]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x150]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7]]\n"
+ "ldr q8, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q9, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q10, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q11, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q8, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xd0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x10]\n"
+ "ldr q9, [%[outptr5], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x100]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x10]\n"
+ "ldr q10, [%[outptr6], #0x10]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x130]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x10]\n"
+ "ldr q11, [%[outptr7], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x160]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q8, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q9, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "ldr q10, [%[outptr6], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x140]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x20]\n"
+ "ldr q11, [%[outptr7], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x170]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x20]\n"
+ "add %[outptr7], %[outptr7], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ }
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
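The cases above are the height-specialised tails of a single merge: each one scales a 12-column by up-to-8-row accumulator block by alpha and adds it to beta times the existing output, using the scalar tail loop for partial blocks and the unrolled asm body for full blocks. A scalar reference of the per-block arithmetic, equivalent to those tail loops, is sketched below; the function name and packed-pointer signature are illustrative only, not the library's MergeResults interface.

    // Scalar reference for one 12x8 merge block: out = alpha*acc + beta*out.
    // 'height' is the number of valid rows (1..8), 'width' the number of valid
    // columns (1..12); 'acc' is the packed 12-wide accumulator buffer, like
    // 'inptr' in the routine above (row r, column c lives at acc[r*12 + c]).
    static void merge_block_ref(float *const out[8], const float *acc,
                                int height, int width, float alpha, float beta)
    {
        for (int r = 0; r < height; r++) {
            for (int c = 0; c < width; c++) {
                out[r][c] = alpha * acc[r * 12 + c] + beta * out[r][c];
            }
        }
    }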
diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
new file mode 100644
index 0000000..20824df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+
+namespace arm_gemm {
+
+template<unsigned int D>
+class NDRange {
+private:
+ unsigned int m_sizes[D];
+ unsigned int m_totalsizes[D];
+
+ class NDRangeIterator {
+ private:
+ const NDRange &m_parent;
+ unsigned int m_pos = 0;
+ unsigned int m_end = 0;
+
+ public:
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+
+ bool done() const {
+ return (m_pos >= m_end);
+ }
+
+ unsigned int dim(unsigned int d) const {
+ unsigned int r = m_pos;
+
+ if (d < (D - 1)) {
+ r %= m_parent.m_totalsizes[d];
+ }
+
+ if (d > 0) {
+ r /= m_parent.m_totalsizes[d-1];
+ }
+
+ return r;
+ }
+
+ bool next_dim0() {
+ m_pos++;
+
+ return !done();
+ }
+
+ bool next_dim1() {
+ m_pos += m_parent.m_sizes[0] - dim(0);
+
+ return !done();
+ }
+
+ unsigned int dim0_max() const {
+ unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
+
+ return dim(0) + offset;
+ }
+ };
+
+public:
+ template <typename... T>
+ NDRange(T... ts) : m_sizes{ts...} {
+ unsigned int t=1;
+
+ for (unsigned int i=0; i<D; i++) {
+ t *= m_sizes[i];
+
+ m_totalsizes[i] = t;
+ }
+ }
+
+ NDRangeIterator iterator(unsigned int start, unsigned int end) const {
+ return NDRangeIterator(*this, start, end);
+ }
+
+ unsigned int total_size() const {
+ return m_totalsizes[D - 1];
+ }
+
+ unsigned int get_size(unsigned int v) const {
+ return m_sizes[v];
+ }
+};
+
+} // namespace arm_gemm
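NDRange linearises a D-dimensional iteration space: the constructor records per-dimension sizes and their running products, iterator(start, end) hands a caller (typically one thread) a [start, end) slice of the flattened space, dim(d) recovers the d-th coordinate from the current linear position, next_dim1() skips to the start of the next dim-0 run, and dim0_max() gives the exclusive end of the contiguous dim-0 run available in the slice. A minimal usage sketch follows; the 12x3 sizes are made up for illustration.

    // Walk a hypothetical 12 x 3 space run by run; each thread would get its own
    // [start, end) slice instead of the full [0, total) range used here.
    arm_gemm::NDRange<2> range(12u, 3u);
    const unsigned int total = range.total_size();   // 36 linear positions

    for (auto pos = range.iterator(0, total); !pos.done(); pos.next_dim1()) {
        const unsigned int y    = pos.dim(1);        // coordinate in dimension 1
        const unsigned int x0   = pos.dim(0);        // first dim-0 index of this run
        const unsigned int xmax = pos.dim0_max();    // one past the last dim-0 index
        // ... process elements [x0, xmax) of row y ...
    }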
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index e422b91..0330783 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,11 +81,14 @@
}
}
// "row" tail - row is out of range so fill with zeros always.
- for (int row = 0; row < blank_rows; row++) {
- for (int col=0; col < (fill_cols + blank_cols); col++) {
- *out++ = static_cast<TOut>(0);
- }
+ TOut zeroval = static_cast<TOut>(0);
+ int pads = blank_rows * (fill_cols + blank_cols);
+
+ for (int i=0; i<pads; i++) {
+ out[i] = zeroval;
}
+
+ out += pads;
}
}
}
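The rewritten row tail relies on the padded rows being contiguous in the output buffer, so blank_rows * (fill_cols + blank_cols) zero elements can be written in one flat pass and the pointer advanced once. A standalone sketch of the same idea, with an illustrative helper name and TOut standing for the template's output type:

    #include <algorithm>

    // Flat zero-fill of the row tail, equivalent to the nested loops it replaces.
    template <typename TOut>
    static void fill_row_tail(TOut *&out, int blank_rows, int fill_cols, int blank_cols)
    {
        const int pads = blank_rows * (fill_cols + blank_cols); // contiguous padding elements
        std::fill_n(out, pads, static_cast<TOut>(0));           // one pass over the padding
        out += pads;                                            // advance the cursor once
    }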
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 347eafb..0648ff6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
#include <arm_neon.h>
@@ -173,4 +173,4 @@
}
}
-#endif // __aarch64__
+#endif // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index fc1f2c2..e1ebba0 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,17 +23,14 @@
*/
#include "a32_interleave_6way_32bit.hpp"
#include "a32_transpose_interleave_8way_32bit.hpp"
-#ifdef __ARM_FEATURE_SVE
-#include "sve_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_block2_32bit.hpp"
-#include "sve_interleave_8way_block4_8bit.hpp"
-#else
-#include "a64_interleave_8way_32bit.hpp"
-#endif
#include "a64_block16_interleave4_8bit.hpp"
#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
#include "a64_interleave_8way_half_to_float.hpp"
#include "a64_transpose_interleave_12way_16bit.hpp"
#include "a64_transpose_interleave_12way_half_to_float.hpp"
#include "a64_transpose_interleave_24way_16bit.hpp"
-#include "transpose_interleave_common.hpp"
+#include "sve_interleave_8way_32bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
+#include "sve_interleave_8way_block4_8bit.hpp"
+#include "transpose_interleave_common.hpp"
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
index 752e837..07c8219 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@
long outpos = 0;
uint32_t *outptr = master_outptr;
- master_outptr += outwidth;
+ master_outptr += (outwidth * 1);
const uint32_t *inptr0 = inptr + y * ldin + k0;
const uint32_t *inptr1 = inptr0 + ldin;
@@ -60,52 +60,53 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
"incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
"zip2 z9.s, z0.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z0.s, z8.s, z4.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z11.s, z1.s, z4.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "zip1 z12.s, z2.s, z4.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z4.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z4.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -115,60 +116,62 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z11.s, z1.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z0.s, z8.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z2.s, z9.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z6.s, z11.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z7.s, z11.s, z14.s\n"
"zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z15.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -178,63 +181,66 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip2 z11.s, z1.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z4.s, z10.s, z14.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z6.s, z11.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z7.s, z11.s, z14.s\n"
"zip1 z8.s, z0.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -244,65 +250,69 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
"zip2 z11.s, z1.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z12.s, z2.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z13.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z14.s, z3.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z15.s, z3.s, z4.s\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -312,66 +322,71 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z5.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
"zip1 z10.s, z1.s, z5.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
"zip1 z12.s, z2.s, z5.s\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip2 z13.s, z2.s, z5.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip1 z14.s, z3.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip2 z15.s, z3.s, z5.s\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -381,67 +396,73 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z6.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z14.s, z3.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip2 z15.s, z3.s, z6.s\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -451,68 +472,75 @@
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z7.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z14.s, z3.s, z7.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z11.s, z1.s, z5.s\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
"zip2 z15.s, z3.s, z7.s\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -522,69 +550,77 @@
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "ld1w z7.s, p0/z, [%[inptr7]]\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "addvl %[inptr7], %[inptr7], #1\n"
+ "zip2 z15.s, z3.s, z7.s\n"
"whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z3.s, z9.s, z13.s\n"
- "incw %[outpos], all, mul #1\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z7.s, z11.s, z15.s\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
"st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
"st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
"st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
"st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
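These SVE hunks do not change the data layout: each case still produces the same 8-way interleave as before, with missing rows padded with zeros. The rewrite reorders the loads, zips, whilelt/incw predicate updates and stores so that only predicates p0-p3 are live at any point (hence p4-p7 disappear from the clobber lists) and switches the loads to post-incremented base pointers (addvl) instead of indexed addressing. In scalar terms, the interleave each case computes is roughly the following illustrative reference, not library code:

    // 8-way 32-bit interleave: element i of row r lands at out[i*8 + r];
    // rows at or beyond 'height' contribute zeros.
    static void interleave8_ref(uint32_t *out, const uint32_t *const in[8],
                                int height, int width)
    {
        for (int i = 0; i < width; i++) {
            for (int r = 0; r < 8; r++) {
                out[i * 8 + r] = (r < height) ? in[r][i] : 0u;
            }
        }
    }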
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index a1fc00e..f070780 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif
+#include <cstddef>
// Macro for unreachable code (e.g. impossible default cases on switch)
#define UNREACHABLE(why) __builtin_unreachable()
@@ -34,7 +32,8 @@
// Paranoid option for the above with assert
// #define UNREACHABLE(why) assert(0 && why)
-inline int iceildiv(const int a, const int b) {
+template<typename T>
+inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
}
@@ -49,13 +48,43 @@
}
}
+namespace arm_gemm {
+namespace utils {
+namespace {
+
+#ifdef __ARM_FEATURE_SVE
+template<size_t sz>
+inline unsigned long get_vector_length_sz() {
+ unsigned long v;
+
+ __asm (
+ "cntb %0"
+ : "=r" (v)
+ );
+
+ return v / sz;
+}
+
+#define VEC_LEN_SPEC(sz, opcode) template <> inline unsigned long get_vector_length_sz<sz>() { unsigned long v; __asm ( opcode " %0" : "=r" (v)); return v; }
+
+VEC_LEN_SPEC(8, "cntd")
+VEC_LEN_SPEC(4, "cntw")
+VEC_LEN_SPEC(2, "cnth")
+VEC_LEN_SPEC(1, "cntb")
+#endif
+
+} // anonymous namespace
+
template <typename T>
inline unsigned long get_vector_length() {
#ifdef __ARM_FEATURE_SVE
- const unsigned long length = svcntb();
+ return get_vector_length_sz<sizeof(T)>();
#else
- const unsigned long length = 16;
+ return 16 / sizeof(T);
#endif
+}
- return length / sizeof(T);
-}
\ No newline at end of file
+} // utils namespace
+} // arm_gemm namespace
+
+using namespace arm_gemm::utils;
\ No newline at end of file
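With these changes iceildiv works for any integral type and get_vector_length<T>() reports the number of elements of T per vector: the SVE path reads the hardware vector length with the cnt* instructions, while the non-SVE path assumes 128-bit NEON registers. A minimal usage sketch, where the values in the comments assume that 128-bit fallback:

    // Elements of each type that fit in one vector register.
    const unsigned long floats_per_vec = arm_gemm::utils::get_vector_length<float>();   // 4 on 128-bit NEON
    const unsigned long bytes_per_vec  = arm_gemm::utils::get_vector_length<uint8_t>(); // 16 on 128-bit NEON

    // Ceiling division, e.g. how many full vectors cover 1000 floats.
    const unsigned long vec_blocks = iceildiv(1000UL, floats_per_vec);                  // 250 when 4 per vector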
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
index 09ac08c..3d8d66d 100644
--- a/src/core/NEON/kernels/assembly/Helpers.cpp
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,91 +24,47 @@
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "NEGEMMInterleavedStrategies.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
namespace arm_compute
{
-namespace
-{
-template <typename InputType, bool use_dot = false>
-BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
- using strategy = typename Kernel<InputType, use_dot>::strategy;
- return calculate_block_sizes<strategy>(ci, M, N, K);
-}
-} // namespace
-
-const char *get_strategy_name(DataType input_type, bool use_dot)
+arm_gemm::KernelDescription get_gemm_info(DataType input_type,
+ const CPUInfo &ci,
+ const unsigned int num_threads,
+ const INEGEMMWrapperKernel::Params &p,
+ float alpha,
+ float beta,
+ bool pretranspose_hint)
{
switch(input_type)
{
- case DataType::F32:
- return Kernel<float>::name;
#ifdef __aarch64__
- case DataType::U8:
case DataType::QASYMM8:
- if(use_dot)
- {
- return Kernel<uint8_t, true>::name;
- }
- else
- {
- return Kernel<uint8_t, false>::name;
- }
+ case DataType::U8:
+ {
+ arm_gemm::GemmArgs<uint32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<uint8_t, uint32_t>(args);
+ }
case DataType::S8:
- if(use_dot)
- {
- return Kernel<int8_t, true>::name;
- }
- else
- {
- return Kernel<int8_t, false>::name;
- }
-#endif /* __aarch64__ */
+ {
+ arm_gemm::GemmArgs<int32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<int8_t, int32_t>(args);
+ }
+#endif // __aarch64__
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Kernel<__fp16>::name;
+ {
+ arm_gemm::GemmArgs<__fp16> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<__fp16, __fp16>(args);
+ }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
-}
-
-BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot)
-{
- switch(input_type)
- {
case DataType::F32:
- return calculate_block_sizes_template<float>(ci, M, N, K);
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- return calculate_block_sizes_template<uint8_t, true>(ci, M, N, K);
- }
- else
- {
- return calculate_block_sizes_template<uint8_t, false>(ci, M, N, K);
- }
- case DataType::S8:
- if(use_dot)
- {
- return calculate_block_sizes_template<int8_t, true>(ci, M, N, K);
- }
- else
- {
- return calculate_block_sizes_template<int8_t, false>(ci, M, N, K);
- }
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return calculate_block_sizes_template<__fp16>(ci, M, N, K);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ {
+ arm_gemm::GemmArgs<float> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<float, float>(args);
+ }
default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
+ return arm_gemm::KernelDescription();
}
}
} // namespace arm_compute
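
get_gemm_info() now simply packages the problem description into arm_gemm::GemmArgs and returns whichever KernelDescription arm_gemm itself would select. A hedged usage sketch follows; the CPUInfo and Params objects are assumed to come from the caller, and only the signature introduced above is relied on:

// Sketch only: assumes a build where the assembly helpers are available and
// that 'ci' and 'p' (CPUInfo, INEGEMMWrapperKernel::Params) come from the caller.
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

using namespace arm_compute;

arm_gemm::KernelDescription pick_f32_kernel(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &p)
{
    // Single-threaded F32 GEMM, alpha = 1, beta = 0, allow B to be pretransposed.
    return get_gemm_info(DataType::F32, ci, 1 /* num_threads */, p, 1.0f, 0.0f, true /* pretranspose_hint */);
}
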
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
deleted file mode 100644
index 2c9cd32..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-namespace arm_compute
-{
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
- const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- _prepared_a = prepared_a;
- _transformed_b = transformed_b;
- _tmp_c = tmp_c;
- _c = c;
- _block_walker = block_walker;
- _block_sizes = block_sizes;
- _params = params;
- _b_is_pretransposed = b_is_pretransposed;
- _alpha = alpha;
- _beta = beta;
-
- auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
- const Coordinates &end_offset)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- TensorAccessor<To> prepared_a(*_prepared_a);
- TensorAccessor<To> transformed_b(*_transformed_b);
- TensorAccessor<Tr> c(*_c);
- TensorAccessor<Tr> tmp_c(*_tmp_c);
-
- int prev_batch = -1;
- To *a_ptr = nullptr;
- auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
- {
- const unsigned int y = id.x();
- const unsigned int batch = id.y();
- const unsigned int ymax = std::min(_params.M, y + strategy::out_height());
-
- // If it's the first block of a new batch then reset the pointer to A.
- if(prev_batch != static_cast<int>(batch))
- {
- const unsigned int first_m = id.x();
- a_ptr = prepared_a(0, first_m, batch);
- prev_batch = batch;
- }
-
- // Call matrix multiply assembly routine to process the block:
- strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
- a_ptr += strategy::out_height() * wl._kern_k;
-
- // Merge the result with the other blocks' results:
- strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<Tr>(1)));
- });
- auto on_new_row_size = [&](unsigned int start, unsigned int end)
- {
- //Nothing to do
- };
- window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_workloads(std::vector<MatrixMultiplyWorkload> &workloads)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- unsigned int offset_transformed_b = 0;
- execute_window_loop(_block_walker, [&](const Coordinates & id)
- {
- const unsigned int x0 = id.x();
- const unsigned int k0 = id.y();
- const unsigned int multi = id.z();
-
- const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
- const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);
-
- // Figure out how many "K" the kernel will actually process.
- const int kern_k = ceil_to_multiple(kmax - k0, strategy::k_unroll());
- const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());
-
- workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));
-
- if(_b_is_pretransposed)
- {
- offset_transformed_b += bblocks * strategy::out_width() * kern_k;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
- });
-}
-
-//TODO: regroup somewhere ?
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float, float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t, true>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float16_t, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
deleted file mode 100644
index 6c201ce..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-// Call the lambda function for each workload generated by the passed window.
-template <typename To, bool use_dot, typename Lambda>
-void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes();
- execute_window_loop(window, [&](const Coordinates & coordinates)
- {
- const unsigned int x0 = coordinates.x();
- const unsigned int k0 = coordinates.y();
- const unsigned int multi = coordinates.z();
-
- const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi));
- const unsigned int xmax = std::min(x0 + window.x().step(), N);
- const unsigned int kmax = std::min(k0 + window.y().step(), K);
-
- /* Figure out the size of each block. */
- unsigned int x_size = (xmax - x0);
- unsigned int k_size = (kmax - k0);
-
- /* Round sizes up as needed. */
- x_size = ceil_to_multiple(x_size, strategy::out_width());
- k_size = ceil_to_multiple(k_size, strategy::k_unroll());
-
- lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax));
-
- //Each workload represents one block:
- offset_transformed_b += (x_size * k_size * sizeof(To));
- });
-}
-
-// Calculate the size of transformed_b:
-template <typename To, bool use_dot>
-unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- // How many full blocks do N / K contain ?
- size_t num_full_k = K / bs.k_block;
- size_t num_full_x = N / bs.x_block;
-
- ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0);
- ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0);
-
- size_t normal_x_size = bs.x_block;
- size_t normal_k_size = bs.k_block;
-
- // Round up the leftovers to be a multiple of the strategy processing size:
- size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width());
- size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll());
-
- // Calculate the total size of the buffer:
- size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
- total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
- return total;
-}
-
-} // namespace
-
-template <typename To, bool use_dot>
-BlockSizes NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::block_sizes() const
-{
- return _block_sizes;
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- const unsigned int multis = b->info()->tensor_shape().z();
- _Nsize = b->info()->tensor_shape().x();
- _Ksize = b->info()->tensor_shape().y();
- _b = b;
- _transformed_b = transformed_b;
- _transpose_b = transpose_b;
-
- _block_sizes = calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
-
- auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ get_B_pretransposed_array_size<To, use_dot>(_Nsize, _Ksize, _block_sizes) }));
-
- Window window;
- window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block));
- window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block));
- window.set(Window::DimZ, Window::Dimension(0, multis));
-
- INEKernel::configure(window);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::transform(const PrepareBWorkload &wl, const ThreadInfo &info)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- strat.transforms.PrepareB(reinterpret_cast<To *>(_transformed_b->buffer() + wl._offset_transformed_b),
- reinterpret_cast<To *>(_b->buffer() + wl._offset_b),
- _b->info()->strides_in_bytes().y() / sizeof(To),
- wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::create_workloads(std::vector<PrepareBWorkload> &workloads)
-{
- for_each_element_in_window<To, use_dot>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl)
- {
- workloads.push_back(std::move(wl));
- });
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window());
- for_each_element_in_window<To, use_dot>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl)
- {
- this->transform(wl, info);
- });
-}
-
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t, true>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
index 69842fe..26d9e99 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,90 +44,184 @@
namespace arm_compute
{
-namespace
+namespace detail
{
-template <typename To, bool use_dot = false>
-struct Kernel
+/** GEMM Interleaved Strategy interface */
+class IInterleavedStrategy
{
+public:
+ /** Virtual Destructor */
+ virtual ~IInterleavedStrategy() = default;
+ /** Return output height of the interleaved strategy
+ *
+ * @return Output height of strategy
+ */
+ virtual unsigned int out_height() const = 0;
+ /** Instantiate and configure a prepareB Kernel
+ *
+ * @param[in] b Input tensor B.
+ * @param[in] transformed_b Reshaped tensor B.
+ * @param[in] params M, N, K sizes.
+ * @param[in] ci CPUInfo to be used for kernel configuration.
+ *
+ * @return A wrapped specialized prepareB kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b,
+ ITensor *transformed_b,
+ const INEGEMMWrapperKernel::Params ¶ms,
+ const CPUInfo &ci) = 0;
+ /** Instantiate and configure a transformA Kernel
+ *
+ * @param[in] a Input tensor A.
+ * @param[in] transformed_a Reshaped tensor A.
+ * @param[in] block_walker Window representing the layout of the matrix's blocks.
+ * @param[in] params M, N, K sizes.
+ *
+ * @return A wrapped specialized transformA kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a,
+ ITensor *transformed_a,
+ const Window &block_walker,
+ const INEGEMMWrapperKernel::Params ¶ms) = 0;
+ /** Instantiate and configure a MatrixMultiply kernel
+ *
+ * @param transformed_a Already reshaped tensor A.
+ * @param transformed_b Already reshaped tensor B.
+ * @param tmp_c Temporary buffer to be used to store intermediate results.
+ * @param c Result tensor C.
+ * @param block_walker Window containing iteration information for the M and batch dimensions.
+ * @param block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
+ * @param params M, N, K sizes.
+ * @param alpha Alpha value
+ * @param beta Beta value
+ * @param pretranspose_b Is B also pretransposed ?
+ * @param num_threads Maximum number of threads that might be used for the calculations.
+ *
+ * @return A wrapped specialized MatrixMultiply kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+ const Window &block_walker, const BlockSizes &block_sizes,
+ const INEGEMMWrapperKernel::Params ¶ms, float alpha, float beta, bool pretranspose_b,
+ unsigned int num_threads) = 0;
+ /** Calculates the block sizes of a given strategy
+ *
+ * @param[in] ci CPUInfo to be used for kernel configuration.
+ * @param[in] params M, N, K sizes.
+ *
+ * @return BlockSizes for a given strategy
+ */
+ virtual BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) = 0;
};
-#define DEFINE_STRATEGY_SUFFIX(strat, suffix) \
- using strategy = arm_gemm::strat; \
- static constexpr const char *name = #strat suffix;
+/** Interleaved Strategy class */
+template <typename StrategyType>
+class InterleavedStrategy : public IInterleavedStrategy
+{
+public:
+ using strategy = StrategyType;
-#define DEFINE_STRATEGY(strat) \
- DEFINE_STRATEGY_SUFFIX(strat, "")
+public:
+ // Inherited methods overridden
+ unsigned int out_height() const override
+ {
+ return strategy::out_height();
+ }
+ std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b,
+ ITensor *transformed_b,
+ const INEGEMMWrapperKernel::Params ¶ms,
+ const CPUInfo &ci) override
+ {
+ auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<strategy>>();
+ prepare_b->configure(b, transformed_b, false, ci, params);
+ return std::move(prepare_b);
+ }
+ std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a,
+ ITensor *transformed_a,
+ const Window &block_walker,
+ const INEGEMMWrapperKernel::Params ¶ms) override
+ {
+ auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<strategy>>();
+ transform_a->configure(a, transformed_a, false, block_walker, params);
+ return std::move(transform_a);
+ }
+ std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+ const Window &block_walker, const BlockSizes &block_sizes,
+ const INEGEMMWrapperKernel::Params ¶ms, float alpha, float beta, bool pretranspose_b,
+ unsigned int num_threads) override
+ {
+ auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<strategy>>();
+ matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, num_threads);
+ return std::move(matrix_multiply);
+ }
-#ifdef __ARM_FEATURE_SVE
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8)
-};
-template <>
-struct Kernel<float16_t, false>
-{
- DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<int8_t, use_dot>
-{
- DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<uint8_t, use_dot>
-{
- DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8)
-};
-#else /* __ARM_FEATURE_SVE */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-struct Kernel<float16_t, false>
-{
- DEFINE_STRATEGY(hgemm_24x8)
-};
-#endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-#ifdef __aarch64__
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(sgemm_12x8)
-};
-template <>
-struct Kernel<int8_t, false>
-{
- DEFINE_STRATEGY(gemm_s8_4x4)
-};
-template <>
-struct Kernel<uint8_t, false>
-{
- DEFINE_STRATEGY(gemm_u8_4x4)
+ BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) override
+ {
+ return calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
+ }
};
-//Use different strategies for 8bit dot product:
-template <>
-struct Kernel<int8_t, true>
+/** Create the backend GEMM strategy to use given the provided kernel info
+ *
+ * @param[in] kernel_name Kernel name of the backend strategy to instantiate
+ *
+ * @return The requested kernel strategy if exists else nullptr
+ */
+std::unique_ptr<IInterleavedStrategy> create_strategy(const std::string &kernel_name)
{
- DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot")
-};
-template <>
-struct Kernel<uint8_t, true>
-{
- DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot")
-};
-#else
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(sgemm_8x6)
-};
-#endif /* __aarch64__ */
-#endif /* __ARM_FEATURE_SVE */
-
-#undef DEFINE_STRATEGY
-#undef DEFINE_STRATEGY_SUFFIX
-
-} // namespace
+#if defined(__arm__)
+ if(kernel_name.find("sgemm_8x6") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_8x6>>();
+ }
+#endif // defined(__arm__)
+#if defined(__aarch64__)
+ if(kernel_name.find("gemm_s8_4x4") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_4x4>>();
+ }
+ if(kernel_name.find("gemm_s8_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_12x8>>();
+ }
+ if(kernel_name.find("gemm_u8_4x4") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_4x4>>();
+ }
+ if(kernel_name.find("gemm_u8_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_12x8>>();
+ }
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ if(kernel_name.find("hgemm_24x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::hgemm_24x8>>();
+ }
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ if(kernel_name.find("sgemm_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_12x8>>();
+ }
+#if defined(__ARM_FEATURE_SVE)
+ if(kernel_name.find("interleaved_fp16_mla_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp16_mla_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_fp32_mla_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp32_mla_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_s8s32_dot_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_s8s32_dot_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_u8u32_dot_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_u8u32_dot_3VLx8>>();
+ }
+#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(__aarch64__)
+ return nullptr;
+}
+} // namespace detail
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__ */
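
create_strategy() maps the kernel name reported by arm_gemm back onto a wrapped interleaved strategy, replacing the old compile-time Kernel<> specialisations. A sketch of the intended flow, assuming the KernelDescription exposes the selected kernel's name and that this internal header is visible to the caller:

// Sketch only: header paths and the KernelDescription 'name' member are
// assumptions, not introduced by this patch.
#include "NEGEMMInterleavedStrategies.h"
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

#include <memory>

using namespace arm_compute;

BlockSizes block_sizes_for_f32(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &p)
{
    // Ask arm_gemm which kernel it would pick for this problem...
    const arm_gemm::KernelDescription kd = get_gemm_info(DataType::F32, ci, 1, p, 1.0f, 0.0f, true);

    // ...then map that kernel name onto a wrapped interleaved strategy.
    std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(kd.name);
    if(strategy == nullptr)
    {
        return BlockSizes{}; // no interleaved wrapper for this kernel; caller falls back
    }
    return strategy->calculate_block_sizes_for_strategy(ci, p);
}
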
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
deleted file mode 100644
index 3b80a1f..0000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker,
- const INEGEMMWrapperKernel::Params ¶ms)
-{
- _a = a;
- _transformed_a = transformed_a;
- _transpose_a = transpose_a;
- _Ksize = params.K;
- _Msize = params.M;
- _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
- const Coordinates &end_offset)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- TensorAccessor<To> a(*_a);
- TensorAccessor<To> transformed_a(*_transformed_a);
-
- if(_a->info()->data_layout() == DataLayout::NHWC)
- {
- // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
- // the relevant multiple of the row stride.
- const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize;
- a.set_stride(2, nhwc_batch_stride);
- }
-
- unsigned int last_m = 0;
- //TODO: Create a new iterate_1D( DimY);
- int last_y = -1;
- auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
- {
- if(id.y() != last_y)
- {
- last_y = id.y();
- unsigned int batch = id.y();
- unsigned int first_m = id.x();
-
- if(first_m >= last_m)
- return;
-
- strat.transforms.PrepareA(transformed_a(0, first_m, batch),
- a(0, 0, batch, wl._multi),
- a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a);
- }
- });
- auto on_new_row_size = [&](unsigned int start, unsigned int end)
- {
- last_m = std::min(end, _Msize);
- };
- window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::create_workloads(std::vector<TransformAWorkload> &workloads)
-{
- execute_window_loop(_k_multi_window, [&](const Coordinates & id)
- {
- const unsigned int k0 = id.x();
- const unsigned int multi = id.y();
- const unsigned int kmax = std::min(k0 + _k_multi_window.x().step(), _Ksize);
-
- workloads.push_back(TransformAWorkload(k0, kmax, multi));
- });
-}
-
-template class NEGEMMInterleavedTransformAWrapperTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t, true>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedTransformAWrapperTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index e452dfb..7b1f3e7 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,11 +34,7 @@
#include "../arm_gemm/mergeresults.hpp"
#include "../arm_gemm/transform.hpp"
-#include "../arm_gemm/kernels/a32_sgemm_8x6.hpp"
-#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
#include "../arm_gemm/kernels/a64_sgemm_native_16x4.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_pretransposed.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_trans.hpp"
namespace arm_compute
{
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 39dad8f..73eaf64 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,9 @@
* SOFTWARE.
*/
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
#include "support/ToolchainSupport.h"
#include <algorithm>
@@ -48,8 +49,10 @@
std::string out;
std::ifstream fs;
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
std::ios_base::openmode mode = std::ios::in;
@@ -68,11 +71,13 @@
fs.seekg(0, std::ios::beg);
// Copy the content of the file
out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR("Accessing %s: %s", filename.c_str(), e.what());
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
return out;
}
@@ -321,17 +326,19 @@
return res;
}
-PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout)
{
- const auto &strides = conv_info.stride();
- const int out_width = std::ceil(float(input_shape.x()) / float(strides.first));
- const int out_height = std::ceil(float(input_shape.y()) / float(strides.second));
- const int pad_width = ((out_width - 1) * strides.first + weights_shape.x() - input_shape.x());
- const int pad_height = ((out_height - 1) * strides.second + weights_shape.y() - input_shape.y());
- const int same_pad_left = pad_width / 2;
- const int same_pad_top = pad_height / 2;
- const int same_pad_right = pad_width - same_pad_left;
- const int same_pad_bottom = pad_height - same_pad_top;
+ const unsigned int width_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const auto &strides = conv_info.stride();
+ const int out_width = std::ceil(float(input_shape[width_idx]) / float(strides.first));
+ const int out_height = std::ceil(float(input_shape[height_idx]) / float(strides.second));
+ const int pad_width = ((out_width - 1) * strides.first + weights_shape[width_idx] - input_shape[width_idx]);
+ const int pad_height = ((out_height - 1) * strides.second + weights_shape[height_idx] - input_shape[height_idx]);
+ const int same_pad_left = pad_width / 2;
+ const int same_pad_top = pad_height / 2;
+ const int same_pad_right = pad_width - same_pad_left;
+ const int same_pad_bottom = pad_height - same_pad_top;
return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
}
@@ -391,6 +398,7 @@
return std::make_pair(w, h);
}
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
{
switch(dt)
@@ -451,3 +459,4 @@
}
return 0;
}
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
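
calculate_same_pad() keeps the same arithmetic but now reads the spatial dimensions through the data layout, so NHWC shapes are padded correctly. The formula itself, restated as a standalone check for one spatial dimension (plain C++, independent of the library types):

#include <cmath>
#include <cstdio>

// Standalone restatement of the SAME-padding arithmetic used above, for one
// spatial dimension. in_dim/weight_dim are read from whichever tensor index
// the data layout maps to WIDTH or HEIGHT.
static void same_pad_1d(int in_dim, int weight_dim, int stride, int &pad_before, int &pad_after)
{
    const int out_dim = static_cast<int>(std::ceil(float(in_dim) / float(stride)));
    const int pad     = (out_dim - 1) * stride + weight_dim - in_dim;
    pad_before        = pad / 2;
    pad_after         = pad - pad_before;
}

int main()
{
    int before = 0, after = 0;
    same_pad_1d(224 /* input */, 3 /* kernel */, 2 /* stride */, before, after);
    std::printf("pad_before=%d pad_after=%d\n", before, after); // 0 and 1
    return 0;
}
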
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index a4bce5d..08803c7 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -23,13 +23,143 @@
*/
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+
namespace arm_compute
{
namespace helpers
{
namespace tensor_transform
{
-Coordinates slice_absolute_end_coords(TensorShape input_shape, Coordinates ends)
+int calculate_stride_on_index(int index, Coordinates strides)
+{
+ return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
+}
+
+int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+{
+ // Early exit
+ if(index >= static_cast<int>(starts.num_dimensions()))
+ {
+ return 0;
+ }
+
+ // Get stride
+ const int stride = calculate_stride_on_index(index, strides);
+
+ // Calculate start
+ int start = starts[index];
+
+ // Reset in case of begin mask present
+ if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+ {
+ start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
+ }
+
+ // Account negative start points
+ const int dim_size = input_shape[index];
+ if(start < 0)
+ {
+ start += dim_size;
+ }
+
+ // Final clamp
+ start = utility::clamp(start, 0, dim_size - 1);
+
+ return start;
+}
+
+int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
+ Coordinates ends, Coordinates strides,
+ int32_t end_mask, int32_t shrink_axis_mask)
+{
+ // Early exit
+ if(index >= static_cast<int>(ends.num_dimensions()))
+ {
+ return input_shape[index];
+ }
+
+ const int stride = calculate_stride_on_index(index, strides);
+ const bool shrink_axis = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, index);
+
+ // Calculate start
+ int stop = ends[index];
+
+ // Shrink dimension
+ if(shrink_axis)
+ {
+ stop = start_on_index + 1;
+ }
+
+ // Reset in case of begin mask present
+ if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+ {
+ stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
+ }
+
+ // Account negative end points
+ const int dim_size = input_shape[index];
+ if(stop < 0)
+ {
+ stop += dim_size;
+ }
+
+ // Final clamp
+ stop = (stride > 0) ? utility::clamp(stop, 0, dim_size) : utility::clamp(stop, -1, dim_size - 1);
+
+ return stop;
+}
+
+std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
+ Coordinates starts, Coordinates ends, Coordinates strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ Coordinates starts_abs, ends_abs, final_strides;
+ for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
+ starts_abs.set(i, start_i);
+ ends_abs.set(i, calculate_end_on_index(input_shape, i, start_i, ends, strides, end_mask, shrink_axis_mask));
+ final_strides.set(i, calculate_stride_on_index(i, strides));
+ }
+
+ return std::make_tuple(starts_abs, ends_abs, final_strides);
+}
+
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+{
+ unsigned int index = 0;
+
+ TensorShape output_shape;
+ for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ const int stride = calculate_stride_on_index(index, strides);
+ const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
+ const int end = calculate_end_on_index(input_shape, i, start, ends, strides, end_mask, shrink_axis_mask);
+ const int range = end - start;
+
+ const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
+ if(return_unshrinked || !is_shrink)
+ {
+ if((range == 0) || // Zero range
+ (range < 0 && stride >= 0) || // Negative range with positive stride
+ (range > 0 && stride <= 0)) // Positive range with negative stride
+ {
+ output_shape.set(index, 0);
+ return output_shape;
+ }
+ else
+ {
+ int dim = range / stride + (range % stride != 0 ? 1 : 0);
+ output_shape.set(index++, dim);
+ }
+ }
+ }
+ return output_shape;
+}
+
+int32_t construct_slice_end_mask(Coordinates ends)
{
// Create end mask
int32_t end_mask = 0;
@@ -40,126 +170,8 @@
end_mask |= 1 << i;
}
}
- // Get unit strides
- const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
- return strided_slice_absolute_end_coords(input_shape, Coordinates(), ends, unit_strides, end_mask);
-}
-
-TensorShape compute_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends_abs)
-{
- // Get unit strides
- const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
- return compute_strided_slice_output_shape(input_shape, starts, ends_abs, unit_strides);
-}
-
-Coordinates strided_slice_absolute_start_coords(TensorShape input_shape, Coordinates starts, Coordinates strides, int32_t begin_mask)
-{
- Coordinates starts_abs;
- for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
- {
- // Get start index
- int start_i = starts[i];
-
- // Reset in case of begin mask present
- if((begin_mask & 1 << i) != 0)
- {
- start_i = strides[i] > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
- }
-
- // Account negative start points
- const int dim_size = input_shape[i];
- if(start_i < 0)
- {
- start_i += dim_size;
- }
-
- // Final clamp
- start_i = utility::clamp(start_i, 0, dim_size - 1);
- starts_abs.set(i, start_i);
- }
-
- // Fill remaining
- for(unsigned int i = starts_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
- {
- starts_abs.set(i, 0);
- }
-
- return starts_abs;
-}
-
-Coordinates strided_slice_absolute_end_coords(TensorShape input_shape, Coordinates starts_abs, Coordinates ends, Coordinates strides,
- int32_t end_mask, int32_t shrink_axis_mask)
-{
- Coordinates ends_abs;
- for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
- {
- // Get end index
- int stop_i = ends[i];
-
- // Shrink dimension
- if((shrink_axis_mask & (1 << i)) != 0)
- {
- stop_i = starts_abs[i] + 1;
- }
-
- // Reset in case of begin mask present
- if((end_mask & 1 << i) != 0)
- {
- stop_i = (strides[i] > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
- }
-
- // Account negative end points
- const int dim_size = input_shape[i];
- if(stop_i < 0)
- {
- stop_i += dim_size;
- }
-
- // Final clamp
- stop_i = (strides[i] > 0) ? utility::clamp(stop_i, 0, dim_size) : utility::clamp(stop_i, -1, dim_size - 1);
- ends_abs.set(i, stop_i);
- }
-
- // Fill remaining ends
- for(unsigned int i = ends_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
- {
- ends_abs.set(i, input_shape[i]);
- }
-
- return ends_abs;
-}
-
-Coordinates strided_slice_strides(TensorShape input_shape, Coordinates strides)
-{
- for(unsigned int i = strides.num_dimensions(); i < input_shape.num_dimensions(); ++i)
- {
- strides.set(i, 1);
- }
- return strides;
-}
-
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts_abs, Coordinates ends_abs, Coordinates final_strides)
-{
- TensorShape output_shape = input_shape;
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
- {
- const int stride_i = final_strides[i];
- const int range = ends_abs[i] - starts_abs[i];
- if((range == 0) || // Zero range
- (range < 0 && stride_i >= 0) || // Negative range with positive stride
- (range > 0 && stride_i <= 0)) // Positive range with negative stride
- {
- output_shape.set(i, 0);
- return output_shape;
- }
- else
- {
- int dim = range / stride_i + (range % stride_i != 0 ? 1 : 0);
- output_shape.set(i, dim);
- }
- }
- return output_shape;
+ return end_mask;
}
} // namespace tensor_transform
} // namespace helpers
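
The slice helpers are now expressed per dimension, which makes the clamping rules easy to verify in isolation. A standalone sketch of the per-dimension behaviour, with the begin/end masks reduced to booleans and the shrink_axis handling omitted (illustrative only):

#include <algorithm>
#include <cstdio>

// Per-dimension restatement of the strided-slice rules above. A set begin/end
// mask means "from the start" / "to the end"; the library expresses this by
// clamping an INT sentinel, which resolves to the values used here.
static int slice_dim_size(int dim_size, int start, int stop, int stride,
                          bool begin_mask_set, bool end_mask_set)
{
    if(begin_mask_set)
    {
        start = (stride > 0) ? 0 : dim_size - 1;
    }
    else
    {
        if(start < 0)
        {
            start += dim_size; // negative indices count from the end
        }
        start = std::min(std::max(start, 0), dim_size - 1);
    }

    if(end_mask_set)
    {
        stop = (stride > 0) ? dim_size : -1;
    }
    else
    {
        if(stop < 0)
        {
            stop += dim_size;
        }
        stop = (stride > 0) ? std::min(std::max(stop, 0), dim_size)
                            : std::min(std::max(stop, -1), dim_size - 1);
    }

    const int range = stop - start;
    if(range == 0 || (range < 0 && stride >= 0) || (range > 0 && stride <= 0))
    {
        return 0; // empty or inconsistent slice
    }
    return range / stride + (range % stride != 0 ? 1 : 0);
}

int main()
{
    // Dimension of size 10, slice [2:9:3] -> elements 2, 5, 8 -> size 3.
    std::printf("%d\n", slice_dim_size(10, 2, 9, 3, false, false));
    // Same dimension, reversed with stride -1 and end mask set -> size 8 (indices 7..0).
    std::printf("%d\n", slice_dim_size(10, 7, 0, -1, false, true));
    return 0;
}
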
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index b2ca28d..a944d2c 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -310,8 +310,8 @@
return nid;
}
-NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, PadStrideInfo conv_info,
- DepthwiseConvolutionMethod method,
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
+ PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
{
CHECK_NODEIDX_PAIR(input, g);
@@ -327,7 +327,7 @@
w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
- get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+ get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
if(!quant_info.empty())
{
w_desc.quant_info = quant_info;
@@ -340,7 +340,7 @@
if(has_bias)
{
TensorDescriptor b_desc = input_tensor_desc;
- b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+ b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
if(is_data_type_quantized_asymmetric(b_desc.data_type))
{
@@ -351,7 +351,7 @@
}
// Create convolution node and connect
- NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, method);
+ NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method);
g.add_connection(input.node_id, input.index, conv_nid, 0);
g.add_connection(w_nid, 0, conv_nid, 1);
if(has_bias)
@@ -362,6 +362,22 @@
return conv_nid;
}
+NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info)
+{
+ CHECK_NODEIDX_PAIR(input_loc, g);
+ CHECK_NODEIDX_PAIR(input_conf, g);
+ CHECK_NODEIDX_PAIR(input_priorbox, g);
+
+ // Create detection_output node and connect
+ NodeID detect_nid = g.add_node<DetectionOutputLayerNode>(detect_info);
+ g.add_connection(input_loc.node_id, input_loc.index, detect_nid, 0);
+ g.add_connection(input_conf.node_id, input_conf.index, detect_nid, 1);
+ g.add_connection(input_priorbox.node_id, input_priorbox.index, detect_nid, 2);
+
+ set_node_params(g, detect_nid, params);
+
+ return detect_nid;
+}
NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
{
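
With the builder change, a non-unit depth multiplier scales the channel dimension of both the auto-created weights and bias tensors. A hedged call sketch, assuming the usual arm_compute::graph types are in scope and that 'graph', 'params', 'prev' and the accessors come from the surrounding graph construction:

// Sketch only: every argument here is a placeholder supplied by the caller.
void add_dwc_with_multiplier(Graph &graph, NodeParams params, NodeIdxPair prev,
                             ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor)
{
    GraphBuilder::add_depthwise_convolution_node(
        graph, params, prev,
        Size2D(3, 3),               // kernel_spatial_extend
        PadStrideInfo(1, 1, 1, 1),  // conv_info
        2,                          // depth_multiplier: weights/bias channels = input channels * 2
        DepthwiseConvolutionMethod::Default,
        std::move(weights_accessor), std::move(bias_accessor));
}
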
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index 30a3546..e0ba7e2 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -38,14 +38,19 @@
{ "qasymm8", DataType::QASYMM8 },
};
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
return data_types.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::out_of_range &)
{
throw std::invalid_argument(name);
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
arm_compute::DataLayout data_layout_from_name(const std::string &name)
@@ -56,14 +61,19 @@
{ "nchw", DataLayout::NCHW },
};
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
return data_layouts.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::out_of_range &)
{
throw std::invalid_argument(name);
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
namespace graph
{
@@ -73,17 +83,22 @@
{
{ "neon", Target::NEON },
{ "cl", Target::CL },
- { "gles", Target::GC },
+ { "gc", Target::GC },
};
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
return targets.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::out_of_range &)
{
throw std::invalid_argument(name);
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
} // namespace graph
} // namespace arm_compute
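
Besides accepting "gc" as the name of the GLES compute target, the loaders now compile their try/catch away when ARM_COMPUTE_EXCEPTIONS_DISABLED is defined, in which case an unknown name ends with std::map::at() terminating the program rather than throwing std::invalid_argument. The pattern in isolation, as a simplified sketch that skips the tolower step:

#include <map>
#include <stdexcept>
#include <string>

enum class Target { NEON, CL, GC };

// Simplified restatement of the lookup pattern above. With
// ARM_COMPUTE_EXCEPTIONS_DISABLED defined the try/catch is compiled out
// entirely and an unknown key terminates inside map::at().
Target target_from_name(const std::string &name)
{
    static const std::map<std::string, Target> targets = {
        { "neon", Target::NEON },
        { "cl", Target::CL },
        { "gc", Target::GC },
    };

#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
    try
    {
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
        return targets.at(name);
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
    }
    catch(const std::out_of_range &)
    {
        throw std::invalid_argument(name);
    }
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
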
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c37a137..b9e3ddc 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/backends/FunctionHelpers.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
using namespace arm_compute::utils::cast;
@@ -68,6 +69,94 @@
using Subtraction = CLArithmeticSubtraction;
using Multiplication = CLPixelWiseMultiplication;
};
+// TODO (isagot01): Remove once we support heterogeneous scheduling at function level
+/** Wrapper for the CPP Function in the OpenCL backend **/
+class CPPWrapperFunction : public IFunction
+{
+public:
+ /* Default constructor */
+ CPPWrapperFunction()
+ : _tensors(), _func(nullptr)
+ {
+ }
+
+ void run() override
+ {
+ for(auto &tensor : _tensors)
+ {
+ tensor->map(CLScheduler::get().queue());
+ }
+ _func->run();
+
+ for(auto &tensor : _tensors)
+ {
+ tensor->unmap(CLScheduler::get().queue());
+ }
+ }
+
+ void register_tensor(ICLTensor *tensor)
+ {
+ _tensors.push_back(tensor);
+ }
+
+ void register_function(std::unique_ptr<IFunction> function)
+ {
+ _func = std::move(function);
+ }
+
+private:
+ std::vector<arm_compute::ICLTensor *> _tensors;
+ std::unique_ptr<IFunction> _func;
+};
+
+namespace detail
+{
+// Specialized functions
+template <>
+std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
+{
+ validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ CLTargetInfo::TensorType *input0 = get_backing_tensor<CLTargetInfo>(node.input(0));
+ CLTargetInfo::TensorType *input1 = get_backing_tensor<CLTargetInfo>(node.input(1));
+ CLTargetInfo::TensorType *input2 = get_backing_tensor<CLTargetInfo>(node.input(2));
+ CLTargetInfo::TensorType *output = get_backing_tensor<CLTargetInfo>(node.output(0));
+ const DetectionOutputLayerInfo detect_info = node.detection_output_info();
+
+ ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+ ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+ ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = support::cpp14::make_unique<CPPDetectionOutputLayer>();
+ func->configure(input0, input1, input2, output, detect_info);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << node.type()
+ << " Target: " << CLTargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type()
+ << " Input0 shape: " << input0->info()->tensor_shape()
+ << " Input1 shape: " << input1->info()->tensor_shape()
+ << " Input2 shape: " << input2->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " DetectionOutputLayer info: " << detect_info
+ << std::endl);
+
+ auto wrap_function = support::cpp14::make_unique<CPPWrapperFunction>();
+ wrap_function->register_function(std::move(func));
+ wrap_function->register_tensor(input0);
+ wrap_function->register_tensor(input1);
+ wrap_function->register_tensor(input2);
+ wrap_function->register_tensor(output);
+
+ return std::move(wrap_function);
+}
+} // namespace detail
std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
{
@@ -95,6 +184,8 @@
return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayerFunctions, CLTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::DetectionOutputLayer:
+ return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::EltwiseLayer:
return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::FlattenLayer:
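
CPPDetectionOutputLayer runs on the host, so the wrapper maps every registered CL tensor into host memory around run() and unmaps it afterwards; the CL backend can front any host-side function this way. A minimal sketch of the registration pattern (the wrapped function and tensors are placeholders supplied by the caller):

// Sketch only: 'cpp_func' is any configured host-side IFunction and in/out are
// the ICLTensors it reads and writes; all three come from the caller.
std::unique_ptr<IFunction> wrap_for_cl(std::unique_ptr<IFunction> cpp_func, ICLTensor *in, ICLTensor *out)
{
    auto wrapper = support::cpp14::make_unique<CPPWrapperFunction>();
    wrapper->register_function(std::move(cpp_func));
    wrapper->register_tensor(in);  // mapped before the wrapped run(), unmapped after
    wrapper->register_tensor(out);
    return std::move(wrapper);
}
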
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index a070973..4b71837 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
using namespace arm_compute::utils::cast;
@@ -59,6 +60,8 @@
case NodeType::DepthwiseConvolutionLayer:
return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::DetectionOutputLayer:
+ return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::GenerateProposalsLayer:
return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 2ca453e..0de58f5 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -176,8 +176,8 @@
const PadStrideInfo conv_info = node.convolution_info();
const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
- const unsigned int depth_multiplier = 1;
const ActivationLayerInfo fused_act = node.fused_activation();
+ const int depth_multiplier = node.depth_multiplier();
// Create and configure function (we assume that functions have been validated before creation)
std::unique_ptr<IFunction> func;
@@ -204,6 +204,7 @@
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
+ << " Depth multiplier: " << depth_multiplier
<< (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
<< std::endl);
return func;
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index fe69c7a..f15ede6 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,6 +111,8 @@
return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::DetectionOutputLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DetectionOutputLayer");
case NodeType::FlattenLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
case NodeType::GenerateProposalsLayer:
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 23ced2f..f94cd97 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index ca8d485..dc987dd 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/graph/backends/FunctionHelpers.h"
#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "support/ToolchainSupport.h"
@@ -77,7 +78,7 @@
namespace detail
{
-// Specialize functions
+// Specialized functions
template <>
std::unique_ptr<IFunction> create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(ConvolutionLayerNode &node,
GraphContext &ctx)
@@ -201,6 +202,8 @@
return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::DetectionOutputLayer:
+ return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::EltwiseLayer:
return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::FlattenLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index a2abc83..b0feec5 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/graph/nodes/Nodes.h"
#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
using namespace arm_compute::utils::cast;
@@ -59,6 +60,8 @@
case NodeType::DepthwiseConvolutionLayer:
return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::DetectionOutputLayer:
+ return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::GenerateProposalsLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index f2c381b..767154b 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -135,6 +135,9 @@
workload.graph = &g;
workload.ctx = &ctx;
+ // Reserve memory for tasks
+ workload.tasks.reserve(node_order.size());
+
// Create tasks
for(auto &node_id : node_order)
{
@@ -146,10 +149,7 @@
std::unique_ptr<IFunction> func = backend.configure_node(*node, ctx);
if(func != nullptr)
{
- ExecutionTask task;
- task.task = std::move(func);
- task.node = node;
- workload.tasks.push_back(std::move(task));
+ workload.tasks.emplace_back(ExecutionTask(std::move(func), node));
}
}
}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 02d1632..75ca5f4 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -32,13 +32,18 @@
{
namespace graph
{
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
- : _info(std::move(info)), _method(method), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method)
+ : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
}
+int DepthwiseConvolutionLayerNode::depth_multiplier() const
+{
+ return _depth_multiplier;
+}
+
void DepthwiseConvolutionLayerNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
{
_method = method;
@@ -66,21 +71,24 @@
TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
+ const PadStrideInfo &info,
+ int depth_multiplier)
{
unsigned int output_width = 0;
unsigned int output_height = 0;
- const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
- const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
- const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int input_channels = get_dimension_size(input_descriptor, DataLayoutDimension::CHANNEL);
+ const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
return output_descriptor;
}
@@ -105,7 +113,7 @@
ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
- return compute_output_descriptor(src->desc(), weights->desc(), _info);
+ return compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
}
NodeType DepthwiseConvolutionLayerNode::type() const
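To make the shape rule introduced above concrete (the output channel count becomes input_channels * depth_multiplier, while width and height still come from scaled_dimensions()), here is a small standalone sketch in plain C++; the 112x112x32 input and depth multiplier of 2 are illustrative values only and no library types are used:

#include <cstdio>

int main()
{
    // Illustrative stride-1, same-padded 3x3 depthwise convolution.
    const unsigned int in_w = 112, in_h = 112, in_c = 32;
    const unsigned int depth_multiplier = 2;

    const unsigned int out_w = in_w;                    // spatial size unchanged in this example
    const unsigned int out_h = in_h;
    const unsigned int out_c = in_c * depth_multiplier; // channel rule added in this patch

    std::printf("Output shape: %ux%ux%u\n", out_w, out_h, out_c); // 112x112x64
    return 0;
}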
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
new file mode 100644
index 0000000..c2d9f24
--- /dev/null
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DetectionOutputLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info)
+ : _info(detection_info)
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const
+{
+ return _info;
+}
+
+TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const DetectionOutputLayerInfo &info)
+{
+ const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(0, detection_size);
+ output_descriptor.shape.set(1, max_size);
+
+ return output_descriptor;
+}
+
+bool DetectionOutputLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor DetectionOutputLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *input0 = input(0);
+ ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+
+ return compute_output_descriptor(input0->desc(), _info);
+}
+
+NodeType DetectionOutputLayerNode::type() const
+{
+ return NodeType::DetectionOutputLayer;
+}
+
+void DetectionOutputLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
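As a rough illustration of compute_output_descriptor() in the new node above: each detection is stored as a fixed-size row of detection_size values (a constant defined in the node's header, conventionally 7: image index, label, confidence and the four box coordinates), and at most keep_top_k detections are kept per batch item. A standalone sketch with illustrative numbers:

#include <cstdio>

int main()
{
    // Illustrative values only; detection_size is taken from the node's header.
    const unsigned int detection_size = 7;
    const unsigned int keep_top_k     = 200; // from DetectionOutputLayerInfo
    const unsigned int batch          = 2;   // input_descriptor.shape[1]

    // Mirrors: shape.set(0, detection_size); shape.set(1, keep_top_k * batch);
    std::printf("Output descriptor: [%u, %u]\n", detection_size, keep_top_k * batch); // [7, 400]
    return 0;
}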
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
index 7367e80..dabfc5a 100644
--- a/src/graph/nodes/GenerateProposalsLayerNode.cpp
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
index 3a29e4c..bfc009d 100644
--- a/src/graph/nodes/SliceLayerNode.cpp
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/graph/nodes/SliceLayerNode.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
@@ -52,16 +52,12 @@
TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const Coordinates &starts, const Coordinates &ends)
{
- // Get absolute end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input_descriptor.shape, ends);
+ using namespace arm_compute::helpers::tensor_transform;
- TensorDescriptor output_descriptor = input_descriptor;
- for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
- {
- output_descriptor.shape.set(i, ends_abs[i] - starts[i]);
- }
+ TensorDescriptor output_desc = input_descriptor;
+ output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
- return output_descriptor;
+ return output_desc;
}
bool SliceLayerNode::forward_descriptors()
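The slice output shape is now delegated to compute_slice_shape(); for non-negative, in-range coordinates the result is simply end minus start per dimension, which is what the removed loop computed. A plain C++ sketch of that simple case, with illustrative shapes and coordinates:

#include <array>
#include <cstdio>

int main()
{
    // Illustrative 3D shape and slice coordinates (non-negative, in range).
    const std::array<unsigned int, 3> shape  = { 10, 20, 3 };
    const std::array<unsigned int, 3> starts = { 1, 2, 0 };
    const std::array<unsigned int, 3> ends   = { 6, 12, 3 };

    std::array<unsigned int, 3> out{};
    for(std::size_t i = 0; i < shape.size(); ++i)
    {
        out[i] = ends[i] - starts[i]; // matches compute_slice_shape() for this simple case
    }
    std::printf("Sliced shape: (%u, %u, %u)\n", out[0], out[1], out[2]); // (5, 10, 3)
    return 0;
}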
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 7f0e374..d9de11e 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -44,6 +44,5 @@
std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
{
- ARM_COMPUTE_UNUSED(alignment);
- return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+ return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
}
\ No newline at end of file
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 2a4ab6e..c5d42b1 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,6 @@
#include <algorithm>
#include <cmath>
#include <map>
-#include <vector>
using namespace arm_compute;
@@ -62,19 +61,21 @@
{
return ba.max_size > bb.max_size;
});
- std::vector<size_t> group_sizes;
+
+ // Create group sizes vector
+ std::vector<BlobInfo> group_sizes;
std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
{
- return b.max_size;
+ return BlobInfo(b.max_size, b.max_alignment);
});
// Update blob sizes
size_t max_size = std::max(_blobs.size(), group_sizes.size());
- _blobs.resize(max_size, 0);
- group_sizes.resize(max_size, 0);
- std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+ _blobs.resize(max_size);
+ group_sizes.resize(max_size);
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
{
- return std::max(lhs, rhs);
+ return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
});
// Calculate group mappings
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index e09451c..812cbdd 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -33,11 +33,11 @@
using namespace arm_compute;
-BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
- : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob_info)
+ : _allocator(allocator), _blobs(), _blob_info(std::move(blob_info))
{
ARM_COMPUTE_ERROR_ON(!allocator);
- allocate_blobs(_blob_sizes);
+ allocate_blobs(_blob_info);
}
BlobMemoryPool::~BlobMemoryPool()
@@ -73,16 +73,16 @@
std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+ return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
}
-void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- for(const auto &size : sizes)
+ for(const auto &bi : blob_info)
{
- _blobs.push_back(_allocator->make_region(size, 0));
+ _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
}
}
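The net effect of the two changes above is that a blob is now described by a (size, alignment) pair, and blob requirements are merged element-wise by taking the maximum of each field before the regions are allocated. A minimal sketch of that merge, using a stand-in aggregate instead of the library's BlobInfo:

#include <algorithm>
#include <cstdio>
#include <vector>

struct BlobInfoSketch // stand-in for the library's BlobInfo (size + alignment)
{
    std::size_t size;
    std::size_t alignment;
};

int main()
{
    std::vector<BlobInfoSketch> blobs       = { { 1024, 16 }, { 512, 64 } };
    std::vector<BlobInfoSketch> group_sizes = { { 2048, 32 } };

    // Element-wise merge, as in the BlobLifetimeManager change above.
    const std::size_t max_size = std::max(blobs.size(), group_sizes.size());
    blobs.resize(max_size);
    group_sizes.resize(max_size);
    std::transform(std::begin(blobs), std::end(blobs), std::begin(group_sizes), std::begin(blobs),
                   [](BlobInfoSketch lhs, BlobInfoSketch rhs)
    {
        return BlobInfoSketch{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment) };
    });

    for(const auto &b : blobs)
    {
        std::printf("size=%zu alignment=%zu\n", b.size, b.alignment); // 2048/32 and 512/64
    }
    return 0;
}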
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
new file mode 100644
index 0000000..533e6fa
--- /dev/null
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+
+namespace
+{
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{
+ printf("%.*s", len, buffer);
+}
+#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+
+/** This initialises the properties vector with the configuration to be used when creating the opencl context
+ *
+ * @param[in] platform The opencl platform used to create the context
+ * @param[in] device The opencl device to be used to create the context
+ * @param[out] prop An array of properties to be initialised
+ *
+ * @note In debug builds, this function will enable cl_arm_printf if it's supported.
+ *
+ * @return A pointer to the context properties which can be used to create an opencl context
+ */
+
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+{
+ ARM_COMPUTE_UNUSED(device);
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+ // Query devices in the context for cl_arm_printf support
+ if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+ {
+ // Create a cl_context with a printf_callback and user specified buffer size.
+ cl_context_properties properties_printf[] =
+ {
+ CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+ // Enable a printf callback function for this context.
+ CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+ // Request a minimum printf buffer size of 4KB (0x1000 bytes) for devices in
+ // the context that support this extension.
+ CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+ 0
+ };
+ std::copy_n(properties_printf, 7, prop);
+ }
+ else
+#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
+ {
+ cl_context_properties properties[] =
+ {
+ CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+ 0
+ };
+ std::copy_n(properties, 3, prop);
+ };
+}
+} // namespace
+
+namespace arm_compute
+{
+std::tuple<cl::Context, cl::Device, cl_int>
+create_opencl_context_and_device()
+{
+ ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+ ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
+ cl::Platform p = platforms[0];
+ cl::Device device;
+ std::vector<cl::Device> platform_devices;
+ p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
+ ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+ initialise_context_properties(p, device, properties);
+ cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+ ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+ return std::make_tuple(cl_context, device, err);
+}
+} // namespace arm_compute
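The helper above lets callers build the context/device pair themselves and then hand it to the scheduler via default_init_with_context(), which is added in CLScheduler.cpp further down. A usage sketch, assuming the helper is declared in the runtime CLHelpers.h header included by the new source file; error handling is kept minimal and the tuner argument is left null:

#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <tuple>

int main()
{
    using namespace arm_compute;

    // Create an OpenCL context and pick the default device (helper added above).
    cl::Context ctx;
    cl::Device  dev;
    cl_int      err = CL_SUCCESS;
    std::tie(ctx, dev, err) = create_opencl_context_and_device();

    // Hand the externally created context/device to the scheduler.
    CLScheduler::get().default_init_with_context(dev, ctx, /* cl_tuner */ nullptr);
    return 0;
}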
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index a311c6f..701ffe0 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,22 +23,14 @@
*/
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/runtime/CL/CLTuner.h"
#include "arm_compute/runtime/CL/tuners/Tuners.h"
using namespace arm_compute;
-namespace
-{
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
-{
- printf("%.*s", len, buffer);
-}
-#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
-} // namespace
-
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
@@ -53,53 +45,30 @@
return scheduler;
}
+void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+{
+ if(!_is_initialised)
+ {
+ cl::CommandQueue queue = cl::CommandQueue(ctx, device);
+ CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+ init(ctx, queue, device, cl_tuner);
+ _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+ _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+ }
+}
+
void CLScheduler::default_init(ICLTuner *cl_tuner)
{
if(!_is_initialised)
{
- std::vector<cl::Platform> platforms;
- cl::Platform::get(&platforms);
- ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
- cl::Platform p = platforms[0];
- cl::Context ctx;
- cl::Device device;
- std::vector<cl::Device> platform_devices;
- p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
- ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-
- // Query devices in the context for cl_arm_printf support
- if(device_supports_extension(device, "cl_arm_printf"))
- {
- // Create a cl_context with a printf_callback and user specified buffer size.
- cl_context_properties properties[] =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
- // Enable a printf callback function for this context.
- CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
- // Request a minimum printf buffer size of 4MB for devices in the
- // context that support this extension.
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
- 0
- };
- ctx = cl::Context(device, properties);
- }
- else
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
- {
- cl_context_properties properties[] =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
- 0
- };
- ctx = cl::Context(device, properties);
- };
-
- cl::CommandQueue queue = cl::CommandQueue(ctx, device);
- CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
- init(ctx, queue, device, cl_tuner);
-
+ cl::Context ctx;
+ cl::Device dev;
+ cl_int err;
+ std::tie(ctx, dev, err) = create_opencl_context_and_device();
+ ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+ cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+ CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
+ init(ctx, queue, dev, cl_tuner);
// Create a default static tuner and set if none was provided
_cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
}
@@ -108,6 +77,21 @@
_cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
}
+void CLScheduler::set_context(cl::Context context)
+{
+ _context = std::move(context);
+ CLKernelLibrary::get().set_context(_context);
+}
+
+void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+{
+ set_context(std::move(context));
+ _queue = std::move(queue);
+ _target = get_target_from_device(device);
+ _is_initialised = true;
+ _cl_tuner = cl_tuner;
+}
+
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f82cd3..a262d6b 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,10 +33,40 @@
#include <limits>
#include <string>
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+/** Utility function used to initialize the LWS values to test.
+ * Only the LWS values which are a power of 2 or which satisfy the modulo conditions with the GWS are taken into account by the CLTuner
+ *
+ * @param[in, out] lws Vector of LWS to test for a specific dimension
+ * @param[in] gws Size of the GWS
+ * @param[in] lws_max Max LWS value allowed to be tested
+ * @param[in] mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one.
+ */
+void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; ++i)
+ {
+ // Power of two condition
+ const bool is_power_of_two = (i & (i - 1)) == 0;
+
+ // Modulo condition, according to the mod_let_one flag
+ const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+ if(mod_cond || is_power_of_two)
+ {
+ lws.push_back(i);
+ }
+ }
+}
+} // namespace
CLTuner::CLTuner(bool tune_new_kernels)
- : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+ : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
{
}
@@ -102,32 +132,35 @@
cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
{
+ // Profiling queue
+ cl::CommandQueue queue_profiler;
+
+ // Extract real OpenCL function to intercept
if(real_clEnqueueNDRangeKernel == nullptr)
{
real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-
- // Get the default queue
- _queue = CLScheduler::get().queue();
-
- // Check if we can use the OpenCL timer with the default queue
- cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
-
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
- {
- // Set the queue for profiling
- _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
- }
- else
- {
- _queue_profiler = _queue;
- }
}
+
+ // Get the default queue
+ cl::CommandQueue default_queue = CLScheduler::get().queue();
+
+ // Check if we can use the OpenCL timer with the default queue
+ cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+ if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ {
+ // Set the queue for profiling
+ queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ queue_profiler = default_queue;
+ }
+
// Start intercepting enqueues:
auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
const cl_event * event_wait_list, cl_event * event)
{
- ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
- ARM_COMPUTE_UNUSED(event);
if(this->kernel_event_is_set())
{
// If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
@@ -139,49 +172,45 @@
// Set OpenCL event
this->set_cl_kernel_event(tmp);
+ if(event != nullptr)
+ {
+ // Return the cl_event from the intercepted call
+ clRetainEvent(tmp);
+ *event = tmp;
+ }
return retval;
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+ cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
cl::NDRange opt_lws = cl::NullRange;
- const int x_step = std::max(1, kernel.window().x().step());
- const int y_step = std::max(1, kernel.window().y().step());
- const int z_step = std::max(1, kernel.window().z().step());
- const int x_end = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1;
- const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
- const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
+ const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
+ const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
+ const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
- // First run using the default LWS
+ std::vector<unsigned int> lws_x;
+ std::vector<unsigned int> lws_y;
+ std::vector<unsigned int> lws_z;
+
+ // Initialize the LWS values to test
+ initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
+ initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
+ initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+
+ for(const auto &z : lws_z)
{
- cl::NDRange lws_test = cl::NullRange;
-
- kernel.set_lws_hint(lws_test);
-
- // Run the kernel
- kernel.run(kernel.window(), _queue_profiler);
-
- _queue_profiler.finish();
-
- const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
- const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
- const cl_ulong diff = end - start;
- _kernel_event = nullptr;
-
- min_exec_time = diff;
- }
-
- for(int z = 1; z <= z_end; ++z)
- {
- for(int y = 1; y <= y_end; ++y)
+ for(const auto &y : lws_y)
{
- for(int x = 1; x <= x_end; ++x)
+ for(const auto &x : lws_x)
{
cl::NDRange lws_test = cl::NDRange(x, y, z);
- const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+ bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+ invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
if(invalid_lws)
{
@@ -192,9 +221,9 @@
kernel.set_lws_hint(lws_test);
// Run the kernel
- kernel.run(kernel.window(), _queue_profiler);
+ kernel.run(kernel.window(), queue_profiler);
- _queue_profiler.finish();
+ queue_profiler.finish();
const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
@@ -278,3 +307,4 @@
}
fs.close();
}
+} // namespace arm_compute
\ No newline at end of file
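For a concrete picture of the candidate set produced by initialize_lws_values() above: with gws = 20, lws_max = 8 and mod_let_one = false, the accepted values are 1 (always added), 2, 4 and 8 (powers of two) and 5 (20 % 5 == 0). A standalone copy of the filter with those illustrative inputs:

#include <cstdio>
#include <vector>

// Same filtering logic as the helper introduced in CLTuner.cpp above.
void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
{
    lws.push_back(1);
    for(unsigned int i = 2; i <= lws_max; ++i)
    {
        const bool is_power_of_two = (i & (i - 1)) == 0;
        const bool mod_cond        = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
        if(mod_cond || is_power_of_two)
        {
            lws.push_back(i);
        }
    }
}

int main()
{
    std::vector<unsigned int> lws_x;
    initialize_lws_values(lws_x, 20 /* gws */, 8 /* lws_max */, false /* mod_let_one */);
    for(unsigned int v : lws_x)
    {
        std::printf("%u ", v); // prints: 1 2 4 5 8
    }
    std::printf("\n");
    return 0;
}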
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+ k->configure(input, output, axis, op);
+ _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
- k->configure(input1, input2, output);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+ k->configure(input, output, policy, 0);
+ _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, operation);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+ return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, COP);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
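A usage sketch for the new function, assuming two F32 inputs of the same shape and a U8 output holding the per-element comparison result; the tensor names, shapes and the omitted data-filling step are illustrative only:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLComparison.h"

int main()
{
    using namespace arm_compute;

    CLScheduler::get().default_init();

    // Illustrative shapes/types; the comparison output is U8.
    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));

    CLComparison greater;
    greater.configure(&a, &b, &out, ComparisonOperation::Greater);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // ... fill a and b ...

    greater.run();
    CLScheduler::get().sync();
    return 0;
}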
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -158,6 +158,18 @@
_scaled_output.allocator()->allocate();
}
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
void CLDeconvolutionLayer::run()
{
prepare();
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
{
return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
- : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+ _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
{
}
@@ -47,25 +50,79 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if(input->info()->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+ _needs_permute = is_nhwc && (depth_multiplier > 1);
+ _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+ && is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = false;
+ _original_weights = weights;
+
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = output;
+
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(_needs_permute)
{
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ output_to_use = &_permuted_output;
+
_kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ else if(is_nhwc)
+ {
+ if(_needs_weights_reshape)
+ {
+ _reshape_weights.configure(weights, &_permuted_weights, info);
+ weights_to_use = &_permuted_weights;
+ }
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ }
else
{
- _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ // Configure kernel
_kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+ // Permute output if needed
+ if(_needs_permute)
+ {
+ // Configure the function to transform the convolved output back to the original NHWC layout
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
// Configure border handler
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- if(input->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(needs_permute)
{
- return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+ const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+ }
+ else if(is_nhwc)
+ {
+ if(needs_weights_reshape)
+ {
+ auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+ act_info));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
}
- return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ return Status{};
}
void CLDepthwiseConvolutionLayer3x3::run()
{
+ prepare();
+
+ _memory_group.acquire();
+
+ if(_needs_permute)
+ {
+ _permute_input_to_nchw.run();
+ }
CLScheduler::get().enqueue(_border_handler);
CLScheduler::get().enqueue(*_kernel);
+
+ if(_needs_permute)
+ {
+ _permute_output_to_nhwc.run();
+ }
+
+ _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nchw.run();
+ _original_weights->mark_as_unused();
+ }
+
+ if(_needs_weights_reshape)
+ {
+ ARM_COMPUTE_ERROR_ON(_needs_permute);
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _permuted_weights.allocator()->allocate();
+ CLScheduler::get().enqueue(_reshape_weights);
+ _original_weights->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
- _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+ _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+ _optimised_function(nullptr)
{
}
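A note on the permute fallback validated above: in NHWC the tensor shape is stored as (C, W, H, N), so applying PermutationVector(1U, 2U, 0U) reorders it to (W, H, C, N), i.e. the storage order the NCHW 3x3 kernel expects. As an illustrative example (values assumed, not from the patch), a 56x56 feature map with 64 channels has shape (64, 56, 56) in NHWC and becomes (56, 56, 64) after the permute; the kernel output is then permuted back to NHWC at the end of run().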
@@ -104,98 +237,110 @@
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t weights_w = weights->info()->dimension(idx_w);
- const size_t weights_h = weights->info()->dimension(idx_h);
- const size_t weights_z = weights->info()->dimension(idx_c);
+ const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- bool append_bias = (biases != nullptr) && !_is_quantized;
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- // Output width and height
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
-
- // Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- // Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
- // GEMV configuration
- DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- _v2mm_kernel.set_target(gpu_target);
- _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- CLScheduler::get().tune_kernel_static(_v2mm_kernel);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
- // Output staged configuration
- if(_is_quantized)
+ if(can_run_optimised_3x3_kernel)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _output_reshaped.allocator()->allocate();
+ auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+ f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _optimised_function = std::move(f);
}
-
- // Fill borders on inputs
- PixelValue zero_in(static_cast<int32_t>(0));
- PixelValue zero_w(static_cast<int32_t>(0));
- if(_is_quantized)
+ else
{
- zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
- zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
- }
- BorderSize border_size = _v2mm_kernel.border_size();
- _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+ const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- border_size.bottom = 0;
- _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+ const size_t weights_w = weights->info()->dimension(idx_w);
+ const size_t weights_h = weights->info()->dimension(idx_h);
+ const size_t weights_z = weights->info()->dimension(idx_c);
- // Allocate intermediate tensors
- _input_reshaped.allocator()->allocate();
- _v2mm_output.allocator()->allocate();
+ _is_prepared = false;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
+ bool append_bias = (biases != nullptr) && !_is_quantized;
+ const GPUTarget gpu_target = CLScheduler::get().target();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
+ // Calculate output shape
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ // Output width and height
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ // Im2Col configuration
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _im2col_kernel.set_target(gpu_target);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+ // Weights reshape configuration
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+ // GEMV configuration
+ DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+ TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ _v2mm_kernel.set_target(gpu_target);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+ // Output staged configuration
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_reshaped.allocator()->allocate();
+ }
+
+ // Fill borders on inputs
+ PixelValue zero_in(static_cast<int32_t>(0));
+ PixelValue zero_w(static_cast<int32_t>(0));
+ if(_is_quantized)
+ {
+ zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+ zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+ }
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
}
@@ -204,55 +349,64 @@
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+ const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
- const size_t weights_w = weights->dimension(idx_w);
- const size_t weights_h = weights->dimension(idx_h);
- const size_t weights_z = weights->dimension(idx_c);
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
- DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
- if(is_quantized)
+ if(!can_run_optimised_3x3_kernel)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
- }
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- // Validate Activation Layer
- if(act_info.enabled())
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights->dimension(idx_w);
+ const size_t weights_h = weights->dimension(idx_h);
+ const size_t weights_z = weights->dimension(idx_c);
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ }
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+ else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
}
-
return Status{};
}
@@ -260,33 +414,48 @@
{
prepare();
- CLScheduler::get().enqueue(_im2col_kernel);
- CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_kernel);
- CLScheduler::get().enqueue(_vector_to_tensor_kernel);
- if(_is_quantized)
+ if(_optimised_function != nullptr)
{
- CLScheduler::get().enqueue(_output_stage_kernel);
+ _optimised_function->run();
}
- if(_is_activationlayer_enabled)
+ else
{
- _activationlayer_function.run();
+ CLScheduler::get().enqueue(_im2col_kernel);
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+ if(_is_quantized)
+ {
+ CLScheduler::get().enqueue(_output_stage_kernel);
+ }
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
}
void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if(_optimised_function != nullptr)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _optimised_function->prepare();
+ }
+ else
+ {
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _original_weights->mark_as_unused();
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _original_weights->mark_as_unused();
- CLScheduler::get().queue().finish();
- _is_prepared = true;
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
}
}
+} // namespace arm_compute
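A minimal usage sketch of the reworked depthwise entry point, assuming the usual CL runtime setup (includes, shapes and padding are illustrative, not taken from this patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void depthwise_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, bias, dst;
        // NCHW layout: shapes are stored as (W, H, C).
        src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

        CLDepthwiseConvolutionLayer depthwise;
        // With a 3x3 filter the function now routes to CLDepthwiseConvolutionLayer3x3 internally;
        // any other filter size keeps the im2col + GEMV path shown in the hunks above.
        depthwise.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        depthwise.run(); // the first run also triggers prepare(), i.e. the weight reshape/permute
        CLScheduler::get().sync();
    }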
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::RSQRT);
+ _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::EXP);
+ _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
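A minimal sketch of the new unary wrappers (shapes assumed, scheduler initialised as in the earlier example; not part of the patch):

    CLTensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    CLRsqrtLayer rsqrt;
    rsqrt.configure(&in, &out); // out[i] = 1 / sqrt(in[i]); CLExpLayer is configured the same way for exp(in[i])

    in.allocator()->allocate();
    out.allocator()->allocate();
    rsqrt.run();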
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ }
+ }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::DIV, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
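A sketch of how these functions are meant to be used, including the x-dimension broadcast that configure_border_handler() caters for (shapes assumed, not from the patch):

    CLTensor a, b, sum;
    a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(1U, 8U), 1, DataType::F32)); // dimension(0) == 1, broadcast along x
    sum.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE); // CLElementwiseMax/Min/SquaredDiff follow the same pattern, minus the policy

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    add.run();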
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,32 +33,42 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
bool flag = true;
if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
{
- // COMPMID-852
- if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ if((m > 1) && n < 16)
{
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ flag = true;
}
else
{
- flag = false;
+ // COMPMID-852
+ if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
+ flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ }
+ else
+ {
+ flag = false;
+ }
}
}
else
@@ -73,17 +83,19 @@
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
- _interleave_kernel(),
- _transpose_kernel(),
_mm_kernel(),
_ma_kernel(),
+ _reshape_lhs_kernel(),
+ _reshape_rhs_kernel(),
+ _mm_reshaped_kernel(),
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
_is_interleaved_transposed(false),
_run_addition(false),
_reshape_b_only_on_first_run(false),
- _is_prepared(false)
+ _is_prepared(false),
+ _is_new_gemm_reshaped(false)
{
}
@@ -106,29 +118,52 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _interleave_kernel.set_target(gpu_target);
+ _reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->info()->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
// Check if we need to reshape the matrix A and matrix B
_is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(_is_interleaved_transposed)
{
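To make the new dispatch heuristic concrete, an illustrative evaluation (numbers assumed, not from the patch): for an F32 GEMM on a Bifrost target with m = n = 256, workload = (256 * 256) / 20 = 3276.8, which exceeds the 1600 threshold, so the CLGEMMMatrixMultiplyReshapedKernel path is taken provided is_interleaved_transposed() already opted for reshaping; with m = n = 64 the workload is 204.8 and the existing CLGEMMMatrixMultiplyKernel path is kept.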
@@ -145,19 +180,37 @@
}
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+ if(_is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure interleave kernel
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ // Configure transpose kernel
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
}
- // Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d),
- gemm_info.fp_mixed_precision());
- CLScheduler::get().tune_kernel_static(_mm_kernel);
+ if(!_is_new_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+ GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+ gemm_info.fp_mixed_precision());
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+ }
if(_is_interleaved_transposed)
{
@@ -170,7 +223,7 @@
}
// Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
_ma_kernel.configure(c, output, beta);
_run_addition = true;
@@ -197,13 +250,15 @@
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -211,9 +266,31 @@
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
+
// Check if we need to reshape the matrix A and matrix B
const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(run_interleave_transpose)
{
@@ -227,19 +304,42 @@
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ if(is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d)));
+ }
+ else
+ {
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+ }
}
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ if(!is_new_gemm_reshaped)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+ run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ }
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
// Validate matrix addition kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
if(_is_interleaved_transposed)
{
// Run interleave kernel
- CLScheduler::get().enqueue(_interleave_kernel, false);
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
}
// Run matrix multiply kernel
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ if(_is_new_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ }
// Run matrix addition kernel
if(_run_addition)
@@ -286,10 +393,11 @@
{
// Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
_original_b->mark_as_unused();
}
CLScheduler::get().queue().finish();
_is_prepared = true;
}
}
+} // namespace arm_compute
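A short sketch of the fused-bias case introduced in this file (shapes assumed, not from the patch): with beta == 1.0f and a 1-D c tensor, and while the new reshaped path is not selected, the bias is folded into the matrix multiply kernel instead of running CLGEMMMatrixAdditionKernel.

    CLTensor a, b, c, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // lhs, stored as (K, M)
    b.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));  // rhs, stored as (N, K)
    c.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));       // 1-D bias of length N
    dst.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

    CLGEMM gemm;
    gemm.configure(&a, &b, &c, &dst, 1.0f /* alpha */, 1.0f /* beta */); // use_fused_add applies for this configuration

    // allocate the tensors and call gemm.run() as usual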
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
_original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
- _is_activationlayer_enabled(false), _is_prepared(false)
+ _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
{
}
@@ -101,7 +101,8 @@
int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+ _run_addition));
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
}
else
{
+ // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+ const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -156,8 +159,10 @@
}
else
{
+ // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+ const bool skip_bias_in_gemm = run_addition || !skip_im2col;
// Perform validation step on Matrix multiply function
- return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
@@ -193,6 +198,8 @@
_skip_col2im = data_layout == DataLayout::NHWC;
_append_bias = (biases != nullptr) && (!_is_quantized);
_is_activationlayer_enabled = act_info.enabled();
+ // In case of F16, fused bias will be used in GEMM
+ _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
// Set the GPU target for im2col and col2im
_im2col_kernel.set_target(CLScheduler::get().target());
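Concretely (illustrative case, not from the patch): a 1x1 NHWC convolution with unit stride and a bias skips im2col, so in F32 _run_addition stays true and the bias is applied by the CLSaturatedArithmeticOperationKernel configured below, while the same configuration in F16 sets _run_addition to false and the bias tensor is handed to CLGEMM, where it is fused into the matrix multiply.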
@@ -242,7 +249,7 @@
else if(_append_bias)
{
// Configure add bias kernel
- _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+ _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
}
// Create GEMM output tensor
@@ -276,9 +283,9 @@
{
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
- const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
int min_activation = 0;
@@ -375,6 +382,8 @@
const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
const bool skip_col2im = data_layout == DataLayout::NHWC;
bool is_activationlayer_enabled = act_info.enabled();
+ // In case of F16, fused bias will be used in GEMM
+ const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
gemm_input_to_use = &im2col_reshaped_info;
}
- else if(append_bias)
+ else if(run_addition)
{
// Validate add bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
}
// Create GEMM output tensor
@@ -459,9 +468,9 @@
{
const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
- const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
@@ -496,7 +505,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
// Validate Col2Im
if(!skip_col2im)
@@ -537,7 +546,7 @@
_mm_gemm.run();
}
- if(_skip_im2col && _append_bias)
+ if(_run_addition)
{
CLScheduler::get().enqueue(_add_bias_kernel);
}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,42 +31,25 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- bool flag = true;
-
- if(gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
- {
- // COMPMID-852
- if(k > 256 && m > 4 && reshape_b_only_on_first_run)
- {
- flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
- }
- else
- {
- flag = false;
- }
- }
- else
- {
- flag = m > 1;
- }
-
- return flag;
+ return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
+ _mm_reshaped_kernel(),
_mtx_a_reshape_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
_original_b(nullptr),
_a_offset(0),
_b_offset(0),
- _is_interleaved_transposed(true),
+ _is_gemm_reshaped(true),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_fuse_output_stage(false)
@@ -108,23 +91,23 @@
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+ _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
_memory_group.manage(&_tmp_b);
}
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
// Configure transpose kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
}
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
_memory_group.manage(&_mm_result_s32);
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
}
// Allocate tensors
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
_tmp_a.allocator()->allocate();
if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
// if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(reshape_matrices)
@@ -252,20 +251,24 @@
reinterpret_input_as_3d = false;
}
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
if(reshape_matrices)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
{
TensorInfo mm_result_s32_info{};
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+ if(reshape_matrices)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
}
else
{
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+ if(reshape_matrices)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
_memory_group.acquire();
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
}
// Run matrix multiply
- CLScheduler::get().enqueue(_mm_kernel, false);
+ if(_is_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, false);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
@@ -374,7 +401,7 @@
{
if(!_is_prepared)
{
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
{
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
@@ -395,3 +422,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
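The new CLGather function is a thin wrapper that configures a single CLGatherKernel. A minimal usage sketch follows; the U32 index type, the gathered-axis shape rule and the scheduler setup are illustrative assumptions, not taken from this patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Gather 4 rows along axis 1 of a 16x32 F32 tensor using a U32 index tensor (assumed index type).
    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // assumed output shape rule

    CLGather gather;
    gather.configure(&src, &indices, &dst, 1 /* axis */);

    src.allocator()->allocate();
    indices.allocator()->allocate();
    dst.allocator()->allocate();

    gather.run();
    CLScheduler::get().sync();
    return 0;
}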
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,7 @@
_memset_kernel(),
_padded_copy_kernel(),
_cpp_nms_kernel(),
+ _is_nhwc(false),
_deltas_permuted(),
_deltas_flattened(),
_scores_permuted(),
@@ -60,10 +61,11 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType data_type = deltas->info()->data_type();
- const int num_anchors = scores->info()->dimension(2);
- const int feat_width = scores->info()->dimension(0);
- const int feat_height = scores->info()->dimension(1);
+ const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int pre_nms_topN = info.pre_nms_topN();
const int post_nms_topN = info.post_nms_topN();
@@ -77,21 +79,37 @@
_deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
// Permute and reshape deltas
- _memory_group.manage(&_deltas_permuted);
- _memory_group.manage(&_deltas_flattened);
- _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
- _deltas_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_deltas_permuted);
+ _memory_group.manage(&_deltas_flattened);
+ _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _deltas_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_deltas_flattened);
+ _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+ }
const TensorShape flatten_shape_scores(1, total_num_anchors);
_scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
// Permute and reshape scores
- _memory_group.manage(&_scores_permuted);
- _memory_group.manage(&_scores_flattened);
- _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
- _scores_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_scores_permuted);
+ _memory_group.manage(&_scores_flattened);
+ _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _scores_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_scores_flattened);
+ _flatten_scores_kernel.configure(scores, &_scores_flattened);
+ }
// Bounding box transform
_memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
- const int num_anchors = scores->dimension(2);
- const int feat_width = scores->dimension(0);
- const int feat_height = scores->dimension(1);
+ const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -156,14 +175,21 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if(scores->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ }
TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -236,9 +262,12 @@
CLScheduler::get().enqueue(_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ if(!_is_nhwc)
+ {
+ CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ CLScheduler::get().enqueue(_permute_scores_kernel, false);
+ }
CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
- CLScheduler::get().enqueue(_permute_scores_kernel, false);
CLScheduler::get().enqueue(_flatten_scores_kernel, false);
// Build the boxes
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -81,3 +81,4 @@
_memory_group.release();
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -110,9 +110,9 @@
_gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
_forget_gate_out2.allocator()->allocate();
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _forget_gate_out1.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
-
if(lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
{
_forget_gate_out3.allocator()->allocate();
}
- _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ CLTensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -160,17 +161,23 @@
_gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
_input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ input_gate_out = &_input_gate_out4;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out5);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out4.allocator()->allocate();
_input_gate_out5.allocator()->allocate();
+ input_gate_out = &_input_gate_out1;
}
- _input_gate_out3.allocator()->allocate();
- _input_gate_out4.allocator()->allocate();
- _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ else
+ {
+ _input_gate_out1.allocator()->allocate();
+ }
+ _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -190,14 +197,13 @@
_gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_out4.allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _forget_gate_out1.allocator()->allocate();
- _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
@@ -223,7 +229,7 @@
_gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
_output2.allocator()->allocate();
_memory_group.manage(&_output5);
- _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+ _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
_output3.allocator()->allocate();
CLTensor *output_gate_out = &_output5;
if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
std::vector<ICLTensor *> scratch_inputs;
if(!lstm_params.has_cifg_opt())
{
- scratch_inputs.emplace_back(&_input_gate_out1);
+ scratch_inputs.emplace_back(input_gate_out);
}
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
- _input_gate_out1.allocator()->allocate();
+ input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
@@ -544,4 +550,4 @@
_concat_scratch_buffer.run();
_memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
#include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
_norm_kernel.configure(input, output, norm_info);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,21 +34,21 @@
{
}
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
{
// Copy the input to the output
_copy_kernel.configure(input, output, padding);
// Set the pages of the output to zero
- _memset_kernel.configure(output, PixelValue());
+ _memset_kernel.configure(output, constant_value);
// Fill padding on the first two dimensions with zeros
- _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+ _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
return Status{};
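With the extra PixelValue parameter, CLPadLayer can fill the padded area with an arbitrary constant rather than zero. A minimal sketch, assuming PaddingList holds per-dimension (before, after) pairs and that the output shape already includes the padding:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    // One element of padding before/after in x, two in y: 8x8 -> 10x12 (assumed shape rule).
    dst.allocator()->init(TensorInfo(TensorShape(10U, 12U), 1, DataType::F32));

    const PaddingList padding = { { 1, 1 }, { 2, 2 } };

    CLPadLayer pad;
    pad.configure(&src, &dst, padding, PixelValue(-1.f)); // pad with -1 instead of the default zero

    src.allocator()->allocate();
    dst.allocator()->allocate();
    pad.run();
    CLScheduler::get().sync();
    return 0;
}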
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
return Status{};
@@ -90,7 +90,7 @@
_add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_memory_group.manage(&_add_output);
- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+ _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
_fully_connected_out.allocator()->allocate();
_gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
_is_prepared = true;
}
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
using namespace arm_compute;
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
// Configure ROI pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(output, start, end, step);
+ _kernel = std::move(k);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ return CLRangeKernel::validate(output, start, end, step);
+}
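CLRange fills a 1-D tensor with an evenly spaced sequence on the device. A minimal sketch, assuming the output must be pre-sized to ceil((end - start) / step) elements:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 5 elements: 0, 2, 4, 6, 8
    CLTensor dst;
    dst.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));

    CLRange range;
    range.configure(&dst, 0.f, 10.f, 2.f);

    dst.allocator()->allocate();
    range.run();
    CLScheduler::get().sync();
    return 0;
}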
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,31 @@
_reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(reduction_axis[i], 1);
+ out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -77,11 +86,10 @@
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
- Coordinates axis_copy = reduction_axis;
- std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_copy[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
- ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+ TensorShape out_shape = input->tensor_shape();
+
+ Coordinates axis_sorted = reduction_axis;
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+ axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+ }
+
+ std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
if(output->total_size() > 0 && keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
}
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ if(keep_dims)
+ {
+ out_shape.set(axis_sorted[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_sorted[i] - i);
+ }
}
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
return Status{};
}
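The axis_sorted[i] - i adjustment above is needed because every remove_dimension() call shifts the remaining dimensions down by one, so the axes must be processed in ascending order. A standalone sketch of the same shape computation, using plain std::vector instead of TensorShape purely for illustration:

#include <algorithm>
#include <cassert>
#include <vector>

// Compute the reduced shape the way CLReduceMean::validate does: wrap negative
// axes, sort them, then either set the reduced dimensions to 1 (keep_dims) or
// erase them with a shifting offset.
std::vector<size_t> reduced_shape(std::vector<size_t> shape, std::vector<int> axes, bool keep_dims)
{
    const int rank = static_cast<int>(shape.size());
    for(auto &a : axes)
    {
        a = (a < 0) ? a + rank : a; // same effect as wrap_around() for valid axes
    }
    std::sort(axes.begin(), axes.end());
    for(size_t i = 0; i < axes.size(); ++i)
    {
        if(keep_dims)
        {
            shape[axes[i]] = 1;
        }
        else
        {
            shape.erase(shape.begin() + (axes[i] - static_cast<int>(i))); // "- i" compensates for dimensions already removed
        }
    }
    return shape;
}

int main()
{
    // A 4-D shape reduced over axes {-1, 1}: (8, 16, 16, 3) -> (8, 16) without keep_dims.
    assert((reduced_shape({ 8, 16, 16, 3 }, { -1, 1 }, false) == std::vector<size_t>{ 8, 16 }));
    return 0;
}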
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,19 @@
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
- if(axis == 0 && !is_data_type_quantized(input->data_type()))
+ bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ if(is_serial)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ }
+ else
{
// Create temporary tensor infos
auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
}
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
- }
return Status{};
}
@@ -122,65 +130,77 @@
{
_num_of_stages = calculate_number_of_stages(input->info(), axis);
_reduction_axis = axis;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
// Create temporary tensors
- if(axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ }
+ else
{
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
+ _memory_group.manage(_results_vector.get());
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
+ PixelValue pixelValue;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ pixelValue = PixelValue();
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ pixelValue = PixelValue();
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
+ pixelValue = PixelValue(1, input->info()->data_type());
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _memory_group.manage(_results_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
- }
- else
- {
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[last_stage - 1].allocator()->allocate();
}
}
@@ -188,7 +208,11 @@
{
_memory_group.acquire();
- if(_reduction_axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+ }
+ else
{
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
@@ -196,10 +220,6 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
- else
- {
- CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
- }
_memory_group.release();
}
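In the parallel path (axis 0, non-quantized input) every stage shrinks the reduced dimension by a factor of 128, matching the shape.set(0, ceil(shape.x() / 128.f)) line above. A standalone sketch of the resulting intermediate widths; the stage count itself comes from calculate_number_of_stages(), which is outside this hunk:

#include <cmath>
#include <cstdio>
#include <vector>

// Width of every intermediate tensor produced by the staged reduction:
// each stage divides the reduced dimension by 128 and rounds up.
std::vector<unsigned int> intermediate_widths(unsigned int input_width, unsigned int num_of_stages)
{
    std::vector<unsigned int> widths;
    unsigned int w = input_width;
    for(unsigned int i = 0; i < num_of_stages - 1; ++i)
    {
        w = static_cast<unsigned int>(std::ceil(w / 128.f));
        widths.push_back(w);
    }
    return widths;
}

int main()
{
    // e.g. a width of 40000 with 3 stages gives intermediate widths 313 and 3,
    // before the last kernel reduces to a single value per row.
    for(unsigned int w : intermediate_widths(40000U, 3U))
    {
        std::printf("%u\n", w);
    }
    return 0;
}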
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+ k->configure(input, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+ k->configure(c, x, y, output);
+ _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Get absolute end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
_kernel = std::move(k);
}
@@ -54,8 +54,8 @@
}));
// Get absolute end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
- return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
}
} // namespace arm_compute
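CLSlice now passes the caller's end coordinates through unchanged and encodes "slice to the end of this dimension" in a bit mask. The sketch below shows the behaviour construct_slice_end_mask() is assumed to have (bit i set whenever ends[i] is negative); the real helper lives in helpers/tensor_transform and may differ in detail. The new CLUnstack further down relies on the same convention by filling slice_end with -1 on every dimension.

#include <cassert>
#include <cstdint>
#include <vector>

// Assumed equivalent of arm_compute::helpers::tensor_transform::construct_slice_end_mask():
// mark every dimension whose end coordinate is negative as "slice until the end".
int32_t slice_end_mask(const std::vector<int> &ends)
{
    int32_t mask = 0;
    for(size_t i = 0; i < ends.size(); ++i)
    {
        if(ends[i] < 0)
        {
            mask |= 1 << i;
        }
    }
    return mask;
}

int main()
{
    // End coordinates (10, -1, -1): dimensions 1 and 2 run to their full extent.
    assert(slice_end_mask({ 10, -1, -1 }) == 6); // 0b110: bits 1 and 2 set
    return 0;
}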
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
{
}
void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape, paddings, output);
}
@@ -57,42 +56,35 @@
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
}
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- // TODO(micspy01): replace with memset once ready
if(_has_padding)
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_memset_kernel, true);
}
-
CLScheduler::get().enqueue(_space_to_batch_kernel, true);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+ : _input(),
+ _stack_kernels(),
+ _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+ _num_inputs = input.size();
+ _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+ // Wrap around negative values
+ const size_t rank = input[0]->num_dimensions();
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+ const unsigned int num_inputs = input.size();
+
+ for(unsigned int i = 0; i < num_inputs; i++)
+ {
+ // All the tensors must have the same rank
+ ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+ }
+
+ return Status{};
+}
+
+void CLStackLayer::run()
+{
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_stack_kernels[i], false);
+ }
+}
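CLStackLayer joins N tensors of equal shape along a new dimension, one CLStackLayerKernel per input. A minimal usage sketch; the output shape rule (a new dimension of size N inserted at the stacking axis) is an assumption for illustration:

#include <vector>

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Stack three 32x16 tensors along axis 0; the output gains a new leading
    // dimension of size 3 (assumed shape rule).
    CLTensor in0, in1, in2, dst;
    for(CLTensor *t : { &in0, &in1, &in2 })
    {
        t->allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    }
    dst.allocator()->init(TensorInfo(TensorShape(3U, 32U, 16U), 1, DataType::F32));

    CLStackLayer stack;
    std::vector<ICLTensor *> inputs = { &in0, &in1, &in2 };
    stack.configure(inputs, 0, &dst);

    for(CLTensor *t : { &in0, &in1, &in2, &dst })
    {
        t->allocator()->allocate();
    }
    stack.run();
    CLScheduler::get().sync();
    return 0;
}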
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+ k->configure(input, output, multiples);
+ _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
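CLTile repeats the input along each dimension according to a Multiples vector. A minimal sketch, assuming Multiples is a per-dimension repeat count and the output shape is the input shape multiplied element-wise:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTile.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Tile an 8x4 tensor twice along x and three times along y: output is 16x12 (assumed shape rule).
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 12U), 1, DataType::F32));

    CLTile tile;
    tile.configure(&src, &dst, Multiples{ 2, 3 });

    src.allocator()->allocate();
    dst.allocator()->allocate();
    tile.run();
    CLScheduler::get().sync();
    return 0;
}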
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+ return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the coordinates used to slice the input tensor: all start coordinates are 0 and, together with the end mask, the unstacking axis selects just one 2D slice at a time.
+ Coordinates slice_end;
+ slice_start.set_num_dimensions(input_num_dimensions);
+ slice_end.set_num_dimensions(input_num_dimensions);
+ for(size_t k = 0; k < input_num_dimensions; ++k)
+ {
+ slice_start.set(k, 0);
+ slice_end.set(k, -1);
+ }
+ slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+ : _num_slices(0),
+ _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+ std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_axis(axis, input->info());
+ _num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+ _strided_slice_vector = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+ for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ {
+ // Adjusts start and end coordinates to take a 2D slice at a time
+ slice_start.set(axis_u, slice);
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+ const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ for(size_t k = 0; k < num_slices; ++k)
+ {
+ slice_start.set(wrap_axis(axis, input), k);
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ }
+ return Status{};
+}
+
+void CLUnstack::run()
+{
+ for(unsigned i = 0; i < _num_slices; ++i)
+ {
+ _strided_slice_vector[i].run();
+ }
+}
+
+} // namespace arm_compute
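
A minimal usage sketch of the unstack function above; the tensors and the axis value are illustrative assumptions:

    // Hypothetical example: split a [W,H,3] tensor into three [W,H] tensors along axis 2.
    std::vector<arm_compute::ICLTensor *> outputs = { &out0, &out1, &out2 }; // preconfigured CLTensors
    arm_compute::CLUnstack unstack;
    unstack.configure(&input, outputs, 2); // negative axis values are wrapped by wrap_axis()
    unstack.run();                         // runs one CLStridedSlice per output slice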
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,8 +50,8 @@
ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
    // Output auto initialization if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorInfo tmp_output_info = *output->clone();
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
switch(num_inputs)
@@ -90,7 +90,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
    // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
} // namespace
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
- _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+ _is_prepared(false)
{
}
@@ -133,14 +133,7 @@
(input->info()->data_type() == DataType::F16)));
// Configure output transform
- _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
- // Configure activation layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
// Allocate temporary tensors
_input0.allocator()->allocate();
@@ -216,11 +209,6 @@
// Run output transform
CLScheduler::get().enqueue(_output_transform);
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-
_memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
k->configure(input, output, winograd_info);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
new file mode 100644
index 0000000..cd97849
--- /dev/null
+++ b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+namespace
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+ bool lhs_interleave, bool rhs_interleave)
+{
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Configure GEMMLHSMatrixInfo
+ lhs_info.m0 = m0;
+ lhs_info.k0 = k0;
+ lhs_info.v0 = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+ lhs_info.interleave = lhs_interleave;
+ lhs_info.transpose = false;
+
+ // Configure GEMMRHSMatrixInfo
+ rhs_info.n0 = n0;
+ rhs_info.k0 = lhs_info.k0;
+ rhs_info.h0 = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+ rhs_info.interleave = rhs_interleave;
+ rhs_info.transpose = true;
+
+ return std::make_pair(lhs_info, rhs_info);
+}
+
+} // namespace
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+ ARM_COMPUTE_UNUSED(data_type);
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ // Configurations for Mali-G76
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
+ {
+ { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
+ };
+
+ // Configurations for Mali-G7x
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
+ {
+ { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
+ };
+
+ switch(gpu_target)
+ {
+ case GPUTarget::G76:
+ return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
+ default:
+ return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
+ }
+ }
+ else
+ {
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
+ }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
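
The v0/h0 assignments above are clamped so that the interleave factor never exceeds the number of available row/column blocks. A standalone sketch of that rule, not part of the library:

    // Effective vertical interleave for an LHS matrix with m rows, block height m0 and requested factor v0:
    // fall back to 1 when fewer than v0 blocks of m0 rows exist.
    unsigned int effective_interleave(unsigned int m, unsigned int m0, unsigned int v0)
    {
        return ((m / (m0 * v0)) == 0) ? 1u : v0;
    }
    // e.g. effective_interleave(8, 4, 16) == 1, effective_interleave(256, 4, 16) == 16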
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 2b179fd..5916bb4 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -190,15 +190,19 @@
return;
}
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
process_workloads(*_workloads, *_feeder, _info);
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(...)
{
_current_exception = std::current_exception();
}
-
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
_job_complete = true;
lock.unlock();
_cv.notify_one();
@@ -250,18 +254,21 @@
info.thread_id = t;
process_workloads(workloads, feeder, info);
-
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
for(auto &thread : _threads)
{
thread.wait();
}
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::system_error &e)
{
std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
#endif /* DOXYGEN_SKIP_THIS */
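
The guards added above compile the try/catch away entirely when exceptions are disabled. A self-contained sketch of the same idiom; the wrapper function is an assumption, only the macro name is taken from the patch:

    #include <functional>

    void run_guarded(const std::function<void()> &work)
    {
    #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
        try
        {
    #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
            work();
    #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
        }
        catch(...)
        {
            // With exceptions enabled, record or rethrow here; with them disabled this block is never compiled.
        }
    #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
    }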
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
new file mode 100644
index 0000000..79e619c
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+namespace arm_compute
+{
+namespace
+{
+Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
+
+ const int num_priors = input_priorbox->tensor_shape()[0] / 4;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+
+ // Validate configured output
+ if(output->total_size() != 0)
+ {
+ const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
+ }
+
+ return Status{};
+}
+
+/** Function used to sort pair<float, T> in descending order based on the score (first) value.
+ */
+template <typename T>
+bool SortScorePairDescend(const std::pair<float, T> &pair1,
+ const std::pair<float, T> &pair2)
+{
+ return pair1.first > pair2.first;
+}
+
+/** Get location predictions from input_loc.
+ *
+ * @param[in] input_loc The input location prediction.
+ * @param[in] num The number of images.
+ * @param[in]  num_priors       Number of predictions per class.
+ * @param[in]  num_loc_classes  Number of location classes. It is 1 if share_location is true,
+ * and is equal to number of classes needed to predict otherwise.
+ * @param[in] share_location If true, all classes share the same location prediction.
+ * @param[out] all_location_predictions All the location predictions.
+ *
+ */
+void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
+ const int num_priors, const int num_loc_classes,
+ const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+{
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_loc_classes; ++c)
+ {
+ int label = share_location ? -1 : c;
+ if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+ {
+ all_location_predictions[i][label].resize(num_priors);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(all_location_predictions[i][label].size() != static_cast<size_t>(num_priors));
+ break;
+ }
+ }
+ }
+ for(int i = 0; i < num; ++i)
+ {
+ for(int p = 0; p < num_priors; ++p)
+ {
+ for(int c = 0; c < num_loc_classes; ++c)
+ {
+ const int label = share_location ? -1 : c;
+ const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
+ //xmin, ymin, xmax, ymax
+ all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+ all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+ all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+ all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+ }
+ }
+ }
+}
+
+/** Get confidence predictions from input_conf.
+ *
+ * @param[in]  input_conf            The input confidence prediction.
+ * @param[in]  num                   The number of images.
+ * @param[in]  num_priors            Number of predictions per class.
+ * @param[in]  num_classes           Number of classes to predict.
+ * @param[out] all_confidence_scores All the confidence scores, one map from class to per-prior scores for each image.
+ *
+ */
+void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
+ const int num_priors, const int num_classes,
+ std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
+{
+ std::vector<float> tmp_buffer;
+ tmp_buffer.resize(num * num_priors * num_classes);
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_classes; ++c)
+ {
+ for(int p = 0; p < num_priors; ++p)
+ {
+ tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
+ *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+ }
+ }
+ }
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_classes; ++c)
+ {
+ all_confidence_scores[i][c].resize(num_priors);
+ all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
+ &tmp_buffer[i * num_classes * num_priors + c * num_priors + num_priors]);
+ }
+ }
+}
+
+/** Get prior boxes from input_priorbox.
+ *
+ * @param[in]  input_priorbox      The input prior boxes.
+ * @param[in]  num_priors          Number of priors.
+ * @param[out] all_prior_bboxes    All the prior bounding boxes.
+ * @param[out] all_prior_variances The corresponding prior box variances.
+ *
+ */
+void retrieve_all_priorbox(const ITensor *input_priorbox,
+ const int num_priors,
+ std::vector<NormalizedBBox> &all_prior_bboxes,
+ std::vector<std::array<float, 4>> &all_prior_variances)
+{
+ for(int i = 0; i < num_priors; ++i)
+ {
+ all_prior_bboxes[i] =
+ {
+ {
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
+ }
+ };
+ }
+
+ std::array<float, 4> var({ { 0, 0, 0, 0 } });
+ for(int i = 0; i < num_priors; ++i)
+ {
+ for(int j = 0; j < 4; ++j)
+ {
+ var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
+ }
+ all_prior_variances[i] = var;
+ }
+}
+
+/** Decode a bbox according to a prior bbox.
+ *
+ * @param[in] prior_bbox The input prior bounding boxes.
+ * @param[in] prior_variance The corresponding input variance.
+ * @param[in] code_type The detection output code type used to decode the results.
+ * @param[in] variance_encoded_in_target If true, the variance is encoded in target.
+ * @param[in] clip_bbox If true, the results should be between 0.f and 1.f.
+ * @param[in] bbox The input bbox to decode
+ * @param[out] decode_bbox The decoded bboxes.
+ *
+ */
+void DecodeBBox(const NormalizedBBox &prior_bbox, const std::array<float, 4> &prior_variance,
+ const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
+ const bool clip_bbox, const NormalizedBBox &bbox, NormalizedBBox &decode_bbox)
+{
+    // If the variance is encoded in the target, we simply need to add the offset predictions;
+    // otherwise we need to scale the offset accordingly.
+ switch(code_type)
+ {
+ case DetectionOutputLayerCodeType::CORNER:
+ {
+ decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]);
+ decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]);
+ decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]);
+ decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]);
+
+ break;
+ }
+ case DetectionOutputLayerCodeType::CENTER_SIZE:
+ {
+ const float prior_width = prior_bbox[2] - prior_bbox[0];
+ const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+            // Check that the prior width and height are greater than 0
+ ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+ ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+ const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
+ const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
+
+ const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+ const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+ const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+ const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+
+ decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
+ decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
+ decode_bbox[2] = (decode_bbox_center_x + decode_bbox_width / 2.f);
+ decode_bbox[3] = (decode_bbox_center_y + decode_bbox_height / 2.f);
+
+ break;
+ }
+ case DetectionOutputLayerCodeType::CORNER_SIZE:
+ {
+ const float prior_width = prior_bbox[2] - prior_bbox[0];
+ const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+ // Check if the prior width and height are greater than 0
+ ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+ ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+ decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+ decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+ decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+ decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
+ }
+
+ if(clip_bbox)
+ {
+ for(auto &d_bbox : decode_bbox)
+ {
+ d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
+ }
+ }
+}
+
+/** Do non maximum suppression given bboxes and scores.
+ *
+ * @param[in] bboxes The input bounding boxes.
+ * @param[in] scores The corresponding input confidence.
+ * @param[in] score_threshold The threshold used to filter detection results.
+ * @param[in] nms_threshold The threshold used in non maximum suppression.
+ * @param[in] eta Adaptation rate for nms threshold.
+ * @param[in] top_k If not -1, keep at most top_k picked indices.
+ * @param[out] indices The kept indices of bboxes after nms.
+ *
+ */
+void ApplyNMSFast(const std::vector<NormalizedBBox> &bboxes,
+ const std::vector<float> &scores, const float score_threshold,
+ const float nms_threshold, const float eta, const int top_k,
+ std::vector<int> &indices)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
+
+ // Get top_k scores (with corresponding indices).
+ std::list<std::pair<float, int>> score_index_vec;
+
+ // Generate index score pairs.
+ for(size_t i = 0; i < scores.size(); ++i)
+ {
+ if(scores[i] > score_threshold)
+ {
+ score_index_vec.emplace_back(std::make_pair(scores[i], i));
+ }
+ }
+
+ // Sort the score pair according to the scores in descending order
+ score_index_vec.sort(SortScorePairDescend<int>);
+
+ // Keep top_k scores if needed.
+ const int score_index_vec_size = score_index_vec.size();
+ if(top_k > -1 && top_k < score_index_vec_size)
+ {
+ score_index_vec.resize(top_k);
+ }
+
+ // Do nms.
+ float adaptive_threshold = nms_threshold;
+ indices.clear();
+
+ while(!score_index_vec.empty())
+ {
+ const int idx = score_index_vec.front().second;
+ bool keep = true;
+ for(int kept_idx : indices)
+ {
+ if(keep)
+ {
+ // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
+ NormalizedBBox intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+ {
+ intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ }
+ else
+ {
+ intersect_bbox = std::array<float, 4>({ {
+ std::max(bboxes[idx][0], bboxes[kept_idx][0]),
+ std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+ std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+ std::min(bboxes[idx][3], bboxes[kept_idx][3])
+ }
+ });
+ }
+
+ float intersect_width = intersect_bbox[2] - intersect_bbox[0];
+ float intersect_height = intersect_bbox[3] - intersect_bbox[1];
+
+ float overlap = 0.f;
+ if(intersect_width > 0 && intersect_height > 0)
+ {
+ float intersect_size = intersect_width * intersect_height;
+ float bbox1_size = (bboxes[idx][2] < bboxes[idx][0]
+ || bboxes[idx][3] < bboxes[idx][1]) ?
+ 0.f :
+ (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+ float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
+ || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
+ 0.f :
+ (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+ overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
+ }
+ keep = (overlap <= adaptive_threshold);
+ }
+ else
+ {
+ break;
+ }
+ }
+ if(keep)
+ {
+ indices.push_back(idx);
+ }
+ score_index_vec.erase(score_index_vec.begin());
+ if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+ {
+ adaptive_threshold *= eta;
+ }
+ }
+}
+
+Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshold must be in [0,1]");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshold must be in [0,1]");
+
+ return Status{};
+}
+} // namespace
+
+CPPNonMaximumSuppression::CPPNonMaximumSuppression()
+ : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
+{
+}
+
+void CPPNonMaximumSuppression::configure(
+ const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
+
+    // Store the input tensors and thresholds
+ _bboxes = bboxes;
+ _scores = scores;
+ _indices = indices;
+
+ _nms_threshold = nms_threshold;
+ _max_output_size = max_output_size;
+ _score_threshold = score_threshold;
+}
+
+Status CPPNonMaximumSuppression::validate(
+ const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
+ return Status{};
+}
+
+void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector)
+{
+ Window input_win;
+ input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
+ input_win.set_dimension_step(0U, 4U);
+ input_win.set_dimension_step(1U, 1U);
+ Iterator input(bboxes, input_win);
+ auto f = [&bboxes_vector, &input](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ bboxes_vector.push_back(NormalizedBBox({ { *input_ptr, *(input_ptr + 1), *(2 + input_ptr), *(3 + input_ptr) } }));
+ };
+ execute_window_loop(input_win, f, input);
+}
+
+void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector)
+{
+ Window window;
+ window.use_tensor_dimensions(scores->info()->tensor_shape());
+ Iterator it(scores, window);
+ auto f = [&it, &scores_vector](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
+ scores_vector.push_back(*input_ptr);
+ };
+ execute_window_loop(window, f, it);
+}
+
+void CPPNonMaximumSuppression::run()
+{
+ std::vector<NormalizedBBox> bboxes_vector;
+ std::vector<float> scores_vector;
+ std::vector<int> indices_vector;
+ extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
+ extract_scores_from_tensor(_scores, scores_vector);
+ ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
+ std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0))));
+}
+
+CPPDetectionOutputLayer::CPPDetectionOutputLayer()
+ : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
+ _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+{
+}
+
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+ // Output auto initialization if not yet initialized
+    // Since the number of bboxes to keep is unknown before nms, the shape is set to the maximum
+    // The maximum is keep_top_k * input_loc_size[1]
+    // Each row is a 7-element vector which stores [image_id, label, confidence, xmin, ymin, xmax, ymax]
+ const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+ auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+
+ _input_loc = input_loc;
+ _input_conf = input_conf;
+ _input_priorbox = input_priorbox;
+ _output = output;
+ _info = info;
+ _num_priors = input_priorbox->info()->dimension(0) / 4;
+ _num = (_input_loc->info()->num_dimensions() > 1 ? _input_loc->info()->dimension(1) : 1);
+
+ _all_location_predictions.resize(_num);
+ _all_confidence_scores.resize(_num);
+ _all_prior_bboxes.resize(_num_priors);
+ _all_prior_variances.resize(_num_priors);
+ _all_decode_bboxes.resize(_num);
+
+ for(int i = 0; i < _num; ++i)
+ {
+ for(int c = 0; c < _info.num_loc_classes(); ++c)
+ {
+ const int label = _info.share_location() ? -1 : c;
+ if(label == _info.background_label_id())
+ {
+ // Ignore background class.
+ continue;
+ }
+ _all_decode_bboxes[i][label].resize(_num_priors);
+ }
+ }
+ _all_indices.resize(_num);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+}
+
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+ return Status{};
+}
+
+void CPPDetectionOutputLayer::run()
+{
+ // Retrieve all location predictions.
+ retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+
+ // Retrieve all confidences.
+ retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
+
+ // Retrieve all prior bboxes.
+ retrieve_all_priorbox(_input_priorbox, _num_priors, _all_prior_bboxes, _all_prior_variances);
+
+ // Decode all loc predictions to bboxes
+ const bool clip_bbox = false;
+ for(int i = 0; i < _num; ++i)
+ {
+ for(int c = 0; c < _info.num_loc_classes(); ++c)
+ {
+ const int label = _info.share_location() ? -1 : c;
+ if(label == _info.background_label_id())
+ {
+ // Ignore background class.
+ continue;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+
+ const std::vector<NormalizedBBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
+
+ const int num_bboxes = _all_prior_bboxes.size();
+ ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
+
+ for(int j = 0; j < num_bboxes; ++j)
+ {
+ DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+ }
+ }
+ }
+
+ int num_kept = 0;
+
+ for(int i = 0; i < _num; ++i)
+ {
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+
+ std::map<int, std::vector<int>> indices;
+ int num_det = 0;
+ for(int c = 0; c < _info.num_classes(); ++c)
+ {
+ if(c == _info.background_label_id())
+ {
+ // Ignore background class
+ continue;
+ }
+ const int label = _info.share_location() ? -1 : c;
+ if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+ {
+ ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+ }
+ const std::vector<float> &scores = conf_scores.find(c)->second;
+ const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(label)->second;
+
+ ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+
+ num_det += indices[c].size();
+ }
+
+ int num_to_add = 0;
+ if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+ {
+ std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+ for(auto it : indices)
+ {
+ const int label = it.first;
+ const std::vector<int> &label_indices = it.second;
+
+ if(conf_scores.find(label) == conf_scores.end())
+ {
+ ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+ }
+
+ const std::vector<float> &scores = conf_scores.find(label)->second;
+ for(auto idx : label_indices)
+ {
+ ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
+ score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+ }
+ }
+
+ // Keep top k results per image.
+ std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<std::pair<int, int>>);
+ score_index_pairs.resize(_info.keep_top_k());
+
+ // Store the new indices.
+
+ std::map<int, std::vector<int>> new_indices;
+ for(auto score_index_pair : score_index_pairs)
+ {
+ int label = score_index_pair.second.first;
+ int idx = score_index_pair.second.second;
+ new_indices[label].push_back(idx);
+ }
+ _all_indices[i] = new_indices;
+ num_to_add = _info.keep_top_k();
+ }
+ else
+ {
+ _all_indices[i] = indices;
+ num_to_add = num_det;
+ }
+ num_kept += num_to_add;
+ }
+
+    // Update the valid region of the output to mark the exact number of detections
+ _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
+
+ int count = 0;
+ for(int i = 0; i < _num; ++i)
+ {
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ for(auto &it : _all_indices[i])
+ {
+ const int label = it.first;
+ const std::vector<float> &scores = conf_scores.find(label)->second;
+ const int loc_label = _info.share_location() ? -1 : label;
+ if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+ {
+                // Either there are no confidence predictions
+                // or there are no location predictions for the current label.
+ ARM_COMPUTE_ERROR("Could not find predictions for the label %d.", label);
+ }
+ const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(loc_label)->second;
+ const std::vector<int> &indices = it.second;
+
+ for(auto idx : indices)
+ {
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i;
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 2)))) = scores[idx];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 3)))) = bboxes[idx][0];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 4)))) = bboxes[idx][1];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 5)))) = bboxes[idx][2];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 6)))) = bboxes[idx][3];
+
+ ++count;
+ }
+ }
+ }
+}
+} // namespace arm_compute
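
For reference, a standalone sketch of the jaccard (IoU) overlap computed inline inside ApplyNMSFast above; NormalizedBBox is assumed to hold {xmin, ymin, xmax, ymax} as floats:

    #include <algorithm>
    #include <array>

    float jaccard_overlap(const std::array<float, 4> &a, const std::array<float, 4> &b)
    {
        const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]); // intersection width
        const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]); // intersection height
        if(iw <= 0.f || ih <= 0.f)
        {
            return 0.f; // boxes do not overlap
        }
        const float inter  = iw * ih;
        const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
        const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
        return inter / (area_a + area_b - inter);
    }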
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
new file mode 100644
index 0000000..c4e1eab
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
+
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+ auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>();
+ kernel->configure(predictions, targets, output, k);
+ _kernel = std::move(kernel);
+}
+
+Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+ return CPPTopKVKernel::validate(predictions, targets, output, k);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index ac19d08..f3355a7 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -39,7 +39,8 @@
#include <unistd.h>
#ifndef BARE_METAL
-#include <regex>
+/* C++ std::regex takes up a lot of space in the standalone builds */
+#include <regex.h>
#include <thread>
#endif /* BARE_METAL */
@@ -94,6 +95,7 @@
return false;
}
}
+
/* Convert an MIDR register value to a CPUModel enum value. */
CPUModel midr_to_model(const unsigned int midr)
{
@@ -144,6 +146,19 @@
break;
}
}
+ else if(implementer == 0x48) // HiSilicon CPUs
+ {
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
+ switch(cpunum)
+ {
+ case 0xd40: // A76 (Kirin 980)
+ model = CPUModel::GENERIC_FP16_DOT;
+ break;
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+ }
return model;
}
@@ -172,12 +187,27 @@
void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
{
+ regex_t proc_regex;
+ regex_t imp_regex;
+ regex_t var_regex;
+ regex_t part_regex;
+ regex_t rev_regex;
+
+ memset(&proc_regex, 0, sizeof(regex_t));
+ memset(&imp_regex, 0, sizeof(regex_t));
+ memset(&var_regex, 0, sizeof(regex_t));
+ memset(&part_regex, 0, sizeof(regex_t));
+ memset(&rev_regex, 0, sizeof(regex_t));
+
+ int ret_status = 0;
// If "long-form" cpuinfo is present, parse that to populate models.
- std::regex proc_regex(R"(^processor.*(\d+)$)");
- std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
- std::regex var_regex(R"(^CPU variant.*0x(.)$)");
- std::regex part_regex(R"(^CPU part.*0x(...)$)");
- std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
+ ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED);
+ ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED);
+ ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED);
+ ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED);
+ ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED);
+ ARM_COMPUTE_UNUSED(ret_status);
+ ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
std::ifstream file;
file.open("/proc/cpuinfo", std::ios::in);
@@ -190,11 +220,11 @@
while(bool(getline(file, line)))
{
- std::smatch match;
-
- if(std::regex_match(line, match, proc_regex))
+ regmatch_t match[2];
+ ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- std::string id = match[1];
+ std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int newcpu = support::cpp11::stoi(id, nullptr);
if(curcpu >= 0 && midr == 0)
@@ -214,32 +244,44 @@
continue;
}
- if(std::regex_match(line, match, imp_regex))
+ ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int impv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (impv << 24);
+
continue;
}
- if(std::regex_match(line, match, var_regex))
+ ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int varv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (varv << 20);
+
continue;
}
- if(std::regex_match(line, match, part_regex))
+ ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int partv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (partv << 4);
+
continue;
}
- if(std::regex_match(line, match, rev_regex))
+ ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int regv = support::cpp11::stoi(match[1], nullptr);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int regv = support::cpp11::stoi(subexp, nullptr);
midr |= (regv);
midr |= (0xf << 16);
+
continue;
}
}
@@ -249,6 +291,13 @@
cpusv[curcpu] = midr_to_model(midr);
}
}
+
+ // Free allocated memory
+ regfree(&proc_regex);
+ regfree(&imp_regex);
+ regfree(&var_regex);
+ regfree(&part_regex);
+ regfree(&rev_regex);
}
int get_max_cpus()
@@ -364,8 +413,11 @@
std::map<std::string, unsigned int> cpu_part_occurrence_map;
// CPU part regex
- std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
- std::smatch cpu_part_match;
+ regex_t cpu_part_rgx;
+ memset(&cpu_part_rgx, 0, sizeof(regex_t));
+ int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED);
+ ARM_COMPUTE_UNUSED(ret_status);
+ ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
// Read cpuinfo and get occurrence of each core
std::ifstream cpuinfo;
@@ -375,9 +427,11 @@
std::string line;
while(bool(getline(cpuinfo, line)))
{
- if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+ regmatch_t match[2];
+ ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- std::string cpu_part = cpu_part_match[1];
+ std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
{
cpu_part_occurrence_map[cpu_part]++;
@@ -389,6 +443,7 @@
}
}
}
+ regfree(&cpu_part_rgx);
// Get min number of threads
auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
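
A self-contained sketch of the POSIX <regex.h> workflow the patch switches to: compile once, regexec per line, take capture group 1 via rm_so/rm_eo, regfree at the end. The sample line is illustrative:

    #include <regex.h>
    #include <iostream>
    #include <string>

    int main()
    {
        regex_t part_regex;
        if(regcomp(&part_regex, "^CPU part.*0x(...)$", REG_EXTENDED) != 0)
        {
            return 1;
        }
        const std::string line = "CPU part        : 0xd40";
        regmatch_t match[2];
        if(regexec(&part_regex, line.c_str(), 2, match, 0) == 0)
        {
            const std::string part = line.substr(match[1].rm_so, match[1].rm_eo - match[1].rm_so);
            std::cout << "part = 0x" << part << "\n"; // prints "part = 0xd40"
        }
        regfree(&part_regex);
        return 0;
    }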
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index c58d184..a35a18a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -170,7 +170,7 @@
{
BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
input->info()->extend_padding(border_size);
- _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
+ _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
}
// Configure im2col
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index 689d8be..aa937a6 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,7 +53,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index d9aa50d..ba05838 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@
_kernel = std::move(k);
// Configure border handler
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
_shift_handler.configure(input);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index c0cf098..cb14b8a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,7 @@
return;
}
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
_shift_handler.configure(input);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index b2e69ee..2569365 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,7 @@
_norm_kernel.configure(input, &_squared_input, output, norm_info);
_multiply_kernel.configure(input, input, &_squared_input, 1.0f);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
// Allocate intermediate buffers
_squared_input.allocator()->allocate();
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 7d928d6..97c20d1 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -59,7 +59,7 @@
// Check if there is a free blob
if(_free_blobs.empty())
{
- _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+ _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
}
else
{
@@ -71,7 +71,7 @@
_active_elements.insert(std::make_pair(obj, obj));
}
-void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size, size_t alignment)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
@@ -80,10 +80,11 @@
ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
// Update object fields and mark object as complete
- Element &el = active_object_it->second;
- el.handle = &obj_memory;
- el.size = size;
- el.status = true;
+ Element &el = active_object_it->second;
+ el.handle = &obj_memory;
+ el.size = size;
+ el.alignment = alignment;
+ el.status = true;
// Find object in the occupied lists
auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
@@ -94,8 +95,9 @@
// Update occupied blob and return as free
occupied_blob_it->bound_elements.insert(obj);
- occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
- occupied_blob_it->id = nullptr;
+ occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
+ occupied_blob_it->max_alignment = std::max(occupied_blob_it->max_alignment, alignment);
+ occupied_blob_it->id = nullptr;
_free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
// Check if all objects are finalized and reset active group
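For context on the new max_alignment field, the blob bookkeeping above reduces to taking the maximum size and the maximum alignment over all elements bound to a blob. A minimal standalone sketch of that aggregation (the struct names below are hypothetical stand-ins, not the library's Blob/Element types):

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the Element/Blob bookkeeping shown in the diff above.
struct ElementReq { size_t size = 0; size_t alignment = 0; };
struct BlobReq    { size_t max_size = 0; size_t max_alignment = 0; };

// A blob must be large enough and aligned strictly enough for every element it hosts,
// so its requirement is the element-wise maximum over all bound elements.
BlobReq blob_requirement(const std::vector<ElementReq> &elements)
{
    BlobReq req{};
    for(const auto &el : elements)
    {
        req.max_size      = std::max(req.max_size, el.size);
        req.max_alignment = std::max(req.max_alignment, el.alignment);
    }
    return req;
}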
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
index ad00070..5ae1c2a 100644
--- a/src/runtime/MEMUtils.cpp
+++ b/src/runtime/MEMUtils.cpp
@@ -27,7 +27,7 @@
#ifndef BARE_METAL
#include <fstream>
-#include <regex>
+#include <iterator>
#include <sstream>
#endif // ifndef BARE_METAL
@@ -43,41 +43,33 @@
size_t memfree = 0;
std::ifstream meminfo_f;
meminfo_f.open("/proc/meminfo", std::ios::in);
+
if(meminfo_f.is_open())
{
- std::stringstream str_stream;
- str_stream << meminfo_f.rdbuf();
- const std::string str = str_stream.str();
- try
+ std::string line;
+ while(bool(getline(meminfo_f, line)))
{
- std::smatch match;
- if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+ std::istringstream iss(line);
+ std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
+ std::istream_iterator<std::string>());
+ if(tokens[0] == "MemTotal:")
{
- const std::string result = match.str(1);
- total = std::stoul(result, nullptr, 0);
+ total = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "MemFree:")
{
- const std::string result = match.str(1);
- memfree = std::stoul(result, nullptr, 0);
+ memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "Buffers:")
{
- const std::string result = match.str(1);
- buffer = std::stoul(result, nullptr, 0);
+ buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "Cached:")
{
- const std::string result = match.str(1);
- memcache = std::stoul(result, nullptr, 0);
+ memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- free = memfree + (buffer + memcache);
}
- catch(std::regex_error &e)
- {
- // failed parsing /proc/meminfo
- // return 0s on all fields
- }
+ free = memfree + (buffer + memcache);
}
#endif // ifndef BARE_METAL
}
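The replacement parser above splits each /proc/meminfo line on whitespace and dispatches on the first token instead of running regexes over the whole file. A self-contained sketch of that per-line tokenisation (using std::stoul here rather than the library's support::cpp11 wrapper):

#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Parse one /proc/meminfo line, e.g. "MemTotal:  16318052 kB", into (key, value in kB).
// Returns an empty key if the line does not contain at least "<key> <value>".
std::pair<std::string, unsigned long> parse_meminfo_line(const std::string &line)
{
    std::istringstream iss(line);
    std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
                                    std::istream_iterator<std::string>());
    if(tokens.size() < 2)
    {
        return { std::string(), 0 };
    }
    return { tokens[0], std::stoul(tokens[1], nullptr, 0) };
}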
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
new file mode 100644
index 0000000..1287204
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder() // NOLINT
+ : _kernel()
+{
+}
+
+void INESimpleFunctionNoBorder::run()
+{
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
new file mode 100644
index 0000000..d33e134
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+{
+}
+void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
+{
+ _reduction_kernel.configure(input, output, axis, op);
+
+ if(axis == 0)
+ {
+ _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
+ _run_fill_border = true;
+ }
+}
+
+Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+ return Status{};
+}
+
+void NEArgMinMaxLayer::run()
+{
+ _memory_group.acquire();
+
+ if(_run_fill_border)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+
+ _memory_group.release();
+}
+
+} // namespace arm_compute
\ No newline at end of file
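A hedged usage sketch of the new NEArgMinMaxLayer follows; the shapes, data types and the U32 index output are illustrative assumptions made for this example, not taken from the diff:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

// Illustrative only: reduce axis 0 of a 32x16 F32 tensor to per-column argmax indices.
void argmax_example()
{
    using namespace arm_compute;

    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U32)); // assumed index type

    NEArgMinMaxLayer argminmax(nullptr); // no memory manager for this sketch
    argminmax.configure(&input, 0, &output, ReductionOperation::ARG_IDX_MAX);

    input.allocator()->allocate();
    output.allocator()->allocate();
    argminmax.run();
}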
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 677e9f6..b155077 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,16 +36,6 @@
auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
}
Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 931e5db..5059162 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -148,8 +148,7 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
- || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+ if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
return ConvolutionMethod::GEMM;
}
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 6887a0a..44d7197 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -145,6 +145,15 @@
_conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
_scaled_output.allocator()->allocate();
}
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+{
+ return NEDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0);
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+{
+ configure(input, weights, bias, output, info, 0, 0);
+}
void NEDeconvolutionLayer::run()
{
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094..f0fd4cf 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,7 +72,7 @@
accum_layout = DataLayout::NCHW;
}
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
_accumulator.info()->set_data_layout(accum_layout);
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
@@ -271,7 +271,7 @@
const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(channel_idx);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
new file mode 100644
index 0000000..74c1957
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEDivisionOperationKernel::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEComparisonOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+ return NEComparisonOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
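A short usage sketch of the new element-wise functions; the tensors are assumed to be already initialised and allocated, and the U8 mask produced by the comparison is an assumption of this example:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"

// Illustrative only: element-wise maximum and a runtime-selected comparison.
void elementwise_example(arm_compute::ITensor *a, arm_compute::ITensor *b,
                         arm_compute::ITensor *max_out, arm_compute::ITensor *cmp_out)
{
    using namespace arm_compute;

    NEElementwiseMax max_fn{};
    max_fn.configure(a, b, max_out); // max_out[i] = max(a[i], b[i])
    max_fn.run();

    NEElementwiseComparison cmp{};
    cmp.configure(a, b, cmp_out, ComparisonOperation::Greater); // cmp_out assumed to be a U8 mask
    cmp.run();
}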
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000..10142c7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+ k->configure(ElementWiseUnary::RSQRT, input, output);
+ _kernel = std::move(k);
+}
+Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
+}
+
+void NEExpLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+ k->configure(ElementWiseUnary::EXP, input, output);
+ _kernel = std::move(k);
+}
+Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
new file mode 100644
index 0000000..dc48731
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFuseBatchNormalization::NEFuseBatchNormalization()
+ : _fuse_bn_kernel()
+{
+}
+
+void NEFuseBatchNormalization::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+ ITensor *fused_weights, ITensor *fused_bias,
+ const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+ float epsilon)
+{
+ _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status NEFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ return NEFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void NEFuseBatchNormalization::run()
+{
+ NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+}
+} // namespace arm_compute
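For reference, the fusion the new function performs is the standard per-channel batch-normalisation folding into convolution weights and bias. A scalar sketch of the arithmetic (the textbook formula, not the kernel implementation):

#include <cmath>

// Per output channel:
//   w_fused = w * gamma / sqrt(var + epsilon)
//   b_fused = (conv_bias - mean) * gamma / sqrt(var + epsilon) + beta
struct FusedChannel
{
    float weight_scale; // multiply every weight of the channel by this factor
    float bias;         // fused bias term for the channel
};

FusedChannel fuse_channel(float gamma, float beta, float mean, float var,
                          float conv_bias, float epsilon)
{
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    return { gamma * inv_std, (conv_bias - mean) * gamma * inv_std + beta };
}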
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 72a3e80..914f088 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,8 +91,8 @@
shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+ TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 922f757..470e922 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
@@ -38,14 +35,14 @@
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+ const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
std::shared_ptr<IMemoryManager> memory_manager)
{
//Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
- switch(method)
+ switch(gemm_kernel_info.method)
{
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
{
if(!pretranspose_hint)
@@ -56,99 +53,41 @@
function->configure(a, b, d, alpha, beta, pretranspose_hint);
return std::move(function);
}
- default:
- return nullptr;
- }
-}
-
-template <typename TypeInput, typename TypeOutput>
-std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(method);
- ARM_COMPUTE_UNUSED(a);
- ARM_COMPUTE_UNUSED(b);
- ARM_COMPUTE_UNUSED(d);
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- return nullptr;
-}
-
-#ifdef __aarch64__
-template <>
-std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
- {
- if(!pretranspose_hint)
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
- return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
- {
- if(!pretranspose_hint)
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
- return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- switch(method)
- {
+#if defined(__aarch64__)
case arm_gemm::GemmMethod::GEMM_NATIVE:
{
- auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
- kernel->configure(a, b, d, alpha, beta);
- auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
- function->configure(std::move(kernel));
- return std::move(function);
+ if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
+ {
+ auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+ kernel->configure(a, b, d, alpha, beta);
+ auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+ function->configure(std::move(kernel));
+ return std::move(function);
+ }
+ return nullptr;
}
+#endif // defined(__aarch64__)
default:
return nullptr;
}
}
-#endif /* __aarch64__ */
/** Fallback in case ACL doesn't have a function */
template <typename TypeInput, typename TypeOutput>
class Fallback : public NEGEMMAssemblyDispatch::IFallback
{
public:
- void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+ /** Initialise the function's input and output.
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] args Matrix multiplication information.
+ * @param[in] memory_group Memory group to be used by the function.
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group);
+
+ // Inherited methods overridden:
void run() override;
void prepare() override;
bool is_configured() const override;
@@ -187,9 +126,16 @@
};
template <typename TypeInput, typename TypeOutput>
-void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group)
{
- _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+ arm_gemm::GemmConfig gemm_cfg;
+ const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
+ if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+ {
+ gemm_cfg.filter = gemm_kernel_info.name;
+ args._cfg = &gemm_cfg;
+ }
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args);
if(_gemm_kernel_asm == nullptr)
{
//configuration not supported: Leave function unconfigured:
@@ -199,7 +145,7 @@
// arm_compute wrapper for the Gemm object (see above)
std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
- acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+ acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
const size_t workspace_size = _gemm_kernel_asm->get_working_size();
if(workspace_size > 0)
{
@@ -229,8 +175,6 @@
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
_pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- _pretranspose.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
}
}
@@ -242,6 +186,7 @@
// Pretranspose B if required
if(_gemm_kernel_asm->B_pretranspose_required())
{
+ _pretranspose.allocator()->allocate();
ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
@@ -335,12 +280,8 @@
arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
//Try to create an ACL function:
- acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- // If the type agnostic factory failed to create an ACL function, try the specialised one:
- if(acl_function == nullptr)
- {
- acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- }
+ acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager));
+
//If we still don't have an ACL function:
if(acl_function == nullptr)
{
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0232a83..be7cc2d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -90,7 +90,7 @@
}
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
_add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
_skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 4b02694..5286f11 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,9 +97,9 @@
else
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+ TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
+ TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
@@ -241,8 +241,8 @@
shape_tmp_b.set(0, b->dimension(1) * 16);
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+ TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+ TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
new file mode 100644
index 0000000..078bd5a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ return NEGatherKernel::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb..8645b43 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,5 +36,5 @@
auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
- _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index d0b80fb..56da966 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -57,8 +57,8 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
- // Reduce shape on axis (supported axis is 0)
- shape.set(0, 1);
+ // Reduce shape on axis
+ shape.set(axis, 1);
sum_sq.set_tensor_shape(shape);
ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
@@ -75,3 +75,4 @@
_memory_group.release();
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 7c7580a..9e7a713 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,8 +111,8 @@
_forget_gate_out2.allocator()->allocate();
_memory_group.manage(&_forget_gate_out5);
_accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _forget_gate_out1.allocator()->allocate();
Tensor *forget_gate_out = &_forget_gate_out5;
-
if(lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,18 +129,18 @@
{
_forget_gate_out3.allocator()->allocate();
}
- _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- forget_gate_out->allocator()->allocate();
+ _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ Tensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(&_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -162,16 +162,22 @@
_input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
_accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ input_gate_out = &_input_gate_out4;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out5);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out4.allocator()->allocate();
_input_gate_out5.allocator()->allocate();
+ input_gate_out = &_input_gate_out1;
}
- _input_gate_out3.allocator()->allocate();
- _input_gate_out4.allocator()->allocate();
- _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ else
+ {
+ _input_gate_out1.allocator()->allocate();
+ }
+ _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -194,11 +200,9 @@
_accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _input_gate_out1.allocator()->allocate();
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_out4.allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _forget_gate_out1.allocator()->allocate();
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
@@ -246,7 +250,6 @@
_output1.allocator()->allocate();
}
_activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- output_gate_out->allocator()->allocate();
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -265,6 +268,7 @@
_activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
_pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_activation.allocator()->allocate();
+ output_gate_out->allocator()->allocate();
if(lstm_params.has_projection())
{
@@ -281,19 +285,22 @@
// Copy cell state and output
_copy_cell_state.configure(&_cell_state_out1, cell_state_out);
- _cell_state_out1.allocator()->allocate();
_copy_output.configure(output_state_out, output);
// Vector for holding the tensors to store in scratch buffer
std::vector<ITensor *> scratch_inputs;
if(!lstm_params.has_cifg_opt())
{
- scratch_inputs.emplace_back(&_input_gate_out1);
+ scratch_inputs.emplace_back(input_gate_out);
}
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ input_gate_out->allocator()->allocate();
+ _cell_state_out1.allocator()->allocate();
+ forget_gate_out->allocator()->allocate();
+ output_gate_out->allocator()->allocate();
}
Status NELSTMLayer::validate(const ITensorInfo *input,
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
new file mode 100644
index 0000000..f5c2718
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
+{
+ const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input.clone()->set_tensor_shape(expected_output_shape);
+ return expected_output_info;
+}
+
+Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
+{
+ const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
+ auto_init_if_empty(output, expected_output_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
+
+ return Status{};
+}
+
+Coordinates get_subtensor_coords(const PaddingList &paddings)
+{
+ Coordinates coords;
+ for(unsigned int i = 0; i < paddings.size(); ++i)
+ {
+ coords.set(i, paddings[i].first);
+ }
+
+ return coords;
+}
+} // namespace
+
+NEPadLayer::NEPadLayer()
+ : _memset_kernel(), _copy_kernel(), _output_subtensor()
+{
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
+
+ // Auto-init
+ auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
+
+ // Create SubTensor (Can use sub-tensor as the kernels to be executed do not require padding)
+ _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
+
+ // Set the pages of the output to the specified value
+ _memset_kernel.configure(output, constant_value);
+
+ // Copy the input to the output
+ _copy_kernel.configure(input, &_output_subtensor);
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+{
+ ARM_COMPUTE_UNUSED(constant_value);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ auto output_clone = output->clone();
+
+ SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+
+ return Status{};
+}
+
+void NEPadLayer::run()
+{
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+} // namespace arm_compute
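A usage sketch of the new pad layer; the padding amounts and shapes are illustrative, and the output tensor could instead be left empty and auto-initialised by configure():

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
#include "arm_compute/runtime/Tensor.h"

// Illustrative only: pad an 8x8 F32 tensor by one element on each side of X and Y.
void pad_example()
{
    using namespace arm_compute;

    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(10U, 10U), 1, DataType::F32));

    NEPadLayer pad{};
    pad.configure(&input, &output, PaddingList{ { 1, 1 }, { 1, 1 } }, PixelValue());

    input.allocator()->allocate();
    output.allocator()->allocate();
    pad.run();
}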
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 1f1400c..3aca4b7 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,14 @@
#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEROIPoolingLayer::NEROIPoolingLayer()
: _roi_kernel()
{
}
-void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
{
_roi_kernel.configure(input, rois, output, pool_info);
}
@@ -43,3 +43,4 @@
{
NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
new file mode 100644
index 0000000..977d502
--- /dev/null
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERange.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERange::NERange()
+ : _kernel()
+{
+}
+
+void NERange::configure(ITensor *output, const float start, const float end, const float step)
+{
+ _kernel.configure(output, start, end, step);
+}
+
+Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ return NERangeKernel::validate(output, start, end, step);
+}
+
+void NERange::run()
+{
+ NEScheduler::get().schedule(&_kernel, Window::DimX);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b022df..014895f 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -14,9 +14,9 @@
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
@@ -39,17 +39,38 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
- {
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
- }
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
}
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
return Status{};
}
@@ -62,22 +83,32 @@
_reduced_outs = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(reduction_axis[i], 1);
+ out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -91,9 +122,13 @@
if(!keep_dims)
{
TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(reduction_axis[i]);
+ out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
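The axis handling added above has two steps worth making explicit: negative axes are first wrapped into the valid range, then the axes are sorted so that removing dimension (axis[i] - i) remains correct as each earlier removal shifts the later indices down by one. A minimal sketch of that normalisation:

#include <algorithm>
#include <vector>

// Wrap negative axes into [0, rank) and sort them ascending, mirroring the
// wrap_around + std::sort steps used by the function above.
std::vector<int> normalise_axes(std::vector<int> axes, int rank)
{
    for(auto &a : axes)
    {
        a = (a < 0) ? a + rank : a; // e.g. axis -1 on a 4D tensor becomes 3
    }
    std::sort(axes.begin(), axes.end());
    return axes;
}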
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 188c2bb..9f81a40 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,16 +50,6 @@
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
}
-BorderMode reduction_operation_border_mode(ReductionOperation op)
-{
- switch(op)
- {
- case ReductionOperation::SUM_SQUARE:
- return BorderMode::CONSTANT;
- default:
- return BorderMode::CONSTANT;
- }
-}
} // namespace
NEReductionOperation::NEReductionOperation()
@@ -86,9 +76,9 @@
if(axis == 0)
{
// Configure fill border kernel
- BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
- BorderMode fill_border_mode = reduction_operation_border_mode(op);
- _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+ const BorderSize fill_border_size = _reduction_kernel.border_size();
+ const PixelValue pixelValue = (op == ReductionOperation::PROD) ? PixelValue(1, input->info()->data_type(), input->info()->quantization_info()) : PixelValue(0, input->info()->data_type());
+ _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
}
}
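(Illustration, not part of the patch.) The PixelValue selection above exists because the padded border must hold the identity element of the reduction: 1 for PROD, 0 for the additive reductions; anything else would leak into the result. A tiny scalar example of that identity argument:

#include <cassert>
#include <vector>

int main()
{
    const std::vector<int> data = { 2, 3, 4 };

    int sum = 0, prod = 1;
    for(int v : data) { sum += v; prod *= v; }

    // Appending the identity of the operation (the "border" value) leaves the result unchanged.
    assert(sum + 0 == 9);    // additive reductions pad with 0
    assert(prod * 1 == 24);  // PROD pads with 1
    return 0;
}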
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
new file mode 100644
index 0000000..139bd50
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>();
+ k->configure(input, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ return NEReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd..483aa4c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,11 @@
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
ARM_COMPUTE_UNUSED(sampling_policy);
+ float sampling_offset = 0.0f;
+ if(sampling_policy == SamplingPolicy::CENTER)
+ {
+ sampling_offset = 0.5f;
+ }
Window win;
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -60,8 +65,8 @@
execute_window_loop(win, [&](const Coordinates & id)
{
- const float in_x = (id.x() + 0.5f) * wr - 0.5f;
- const float in_y = (id.y() + 0.5f) * hr - 0.5f;
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
const int in_xi = std::floor(in_x);
const int in_yi = std::floor(in_y);
@@ -167,14 +172,14 @@
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
ITensorInfo *offsets = nullptr;
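(Illustration, not part of the patch.) The sampling_offset introduced above keeps the previous half-pixel mapping for SamplingPolicy::CENTER and aligns the tensor origins for the newly accepted TOP_LEFT policy. A small standalone sketch of the coordinate mapping, assuming the same formula used in the execute_window_loop above:

#include <cstdio>

// Maps a destination coordinate to a source coordinate for a given scale ratio,
// mirroring in_x = (x + offset) * ratio - offset from NEScale's offset precompute.
static float map_coord(int dst, float ratio, float sampling_offset)
{
    return (dst + sampling_offset) * ratio - sampling_offset;
}

int main()
{
    const float wr = 2.0f; // e.g. downscaling by 2 along x

    // CENTER policy: offset 0.5 aligns pixel centres.
    std::printf("CENTER   dst=0 -> src=%.2f\n", map_coord(0, wr, 0.5f)); // 0.50
    // TOP_LEFT policy: offset 0.0 aligns the origins.
    std::printf("TOP_LEFT dst=0 -> src=%.2f\n", map_coord(0, wr, 0.0f)); // 0.00
    return 0;
}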
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
new file mode 100644
index 0000000..509bbaa
--- /dev/null
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESelect.h"
+
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>();
+ k->configure(c, x, y, output);
+ _kernel = std::move(k);
+}
+
+Status NESelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ return NESelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
new file mode 100644
index 0000000..03c2053
--- /dev/null
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESlice.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ // Get absolute end coordinates
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+ auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+ k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+ _kernel = std::move(k);
+}
+
+Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+ // Check start dimensions for being non-negative
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+ {
+ return i < 0;
+ }));
+
+ // Get absolute end coordinates
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+ return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 9be9e68..36b7d47 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,54 +25,155 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "utils/TypePrinter.h"
#include <cfloat>
-using namespace arm_compute;
-
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp()
+namespace arm_compute
{
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
+ _output_flattened(), _needs_flattening(false)
+{
+}
+
+void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+{
+ // Flatten the input
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+ // Initialize the flat input
+ _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+ // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
+    // If flattening on the third axis, we use NEFlattenKernel.
+ // In all other cases we have to use NEReshapeKernel
+ if(axis != 3)
+ {
+ auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+ reshape_kernel_ptr->configure(input, &_input_flattened);
+ _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+ }
+ else
+ {
+ auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+ flatten_kernel_ptr->configure(input, &_input_flattened);
+ _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+ }
+
+    // We need to init the output tensor here because the reshape kernel expects
+    // both tensors to be already initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
{
+ // Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(axis);
+ ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
- // Configure Kernels
- _max_kernel.configure(input, &_max);
- _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
- _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
+    // Flattening is only unnecessary when the input is 2D and the axis is 1
+ _needs_flattening = axis != 1;
+
+ // If we are dealing with a 4D tensor, we will:
+ // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+    // - Execute the whole pipeline (reduction + normalization) on the flattened tensor
+ // - Reshape the flattened output into the real output
+ if(_needs_flattening)
+ {
+ // Add to the memory manager _input_flattened
+ _memory_group.manage(&_input_flattened);
+
+ // Configure _flatten_kernel and _input_flattened
+ configure_reshape_input_kernel(input, output, axis);
+ }
+
+ // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+    // or the original input itself (2D case)
+ ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+
+ // Create intermediate tensors shapes
+ const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
// Init intermediate tensors
- _max.allocator()->init(*_max.info());
- _tmp.allocator()->init(*_tmp.info());
+ TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+ max_sum_shape.set(0, 1);
+ _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+ _tmp.allocator()->init(tensor_info_tmp);
// Manage intermediate buffers
_memory_group.manage(&_max);
_memory_group.manage(&_tmp);
- // Allocate intermediate tensors
+ // Configure Kernels
+ _max_kernel.configure(input_2D, &_max);
+ if(_needs_flattening)
+ {
+ // Add to the memory manager _output_flattened
+ _memory_group.manage(&_output_flattened);
+
+ // The normalization kernel stores the result in a flat output tensor
+ _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
+ _input_flattened.allocator()->allocate();
+
+ // Reshape the flat output into the requested (4D) output
+ _reshape_kernel.configure(&_output_flattened, output);
+
+ // Allocate the intermediate flat tensors
+ _output_flattened.allocator()->allocate();
+ }
+ else
+ {
+ // Softmax 2D case
+ _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
+ _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+ }
+
+ // Allocate intermediate buffers
_max.allocator()->allocate();
_tmp.allocator()->allocate();
}
Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
-
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < 1 || input->num_dimensions() < axis);
- const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1);
- const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
- const TensorInfo dont_care;
+ // Create intermediate tensor info
+ DataType tmp_data_type = input->data_type();
+ const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+ TensorShape max_sum_shape = input->tensor_shape();
+ max_sum_shape.set(0, 1);
+ const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
+ const TensorInfo dont_care;
+
+ const bool needs_flattening = (axis != 1);
+
+ if(needs_flattening)
+ {
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+ TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+ if(axis != 3)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+ }
+ }
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
return Status{};
}
@@ -81,9 +182,20 @@
{
_memory_group.acquire();
+ if(_needs_flattening)
+ {
+ NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+ }
+
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
+ if(_needs_flattening)
+ {
+ NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+ }
+
_memory_group.release();
}
+} // namespace arm_compute
\ No newline at end of file
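(Illustration, not part of the patch.) The flattening described in the comments above collapses every dimension below the softmax axis into a single one, so the 1D max/softmax kernels can run row by row. A hedged sketch of that shape calculation, assuming compute_softmax_shape() behaves as the comments describe (plain std::vector in place of TensorShape):

#include <cassert>
#include <cstddef>
#include <vector>

// Collapses the first `axis` dimensions into one and keeps the rest,
// e.g. {W, H, D, N} with axis = 3 becomes {W*H*D, N}.
static std::vector<size_t> flatten_for_softmax(const std::vector<size_t> &shape, size_t axis)
{
    std::vector<size_t> out;
    size_t flat = 1;
    for(size_t i = 0; i < axis && i < shape.size(); ++i) flat *= shape[i];
    out.push_back(flat);
    for(size_t i = axis; i < shape.size(); ++i) out.push_back(shape[i]);
    return out;
}

int main()
{
    const std::vector<size_t> in = { 7, 5, 3, 2 }; // W, H, D, batches
    assert((flatten_for_softmax(in, 3) == std::vector<size_t>{ 105, 2 })); // 4D case flattened
    assert((flatten_for_softmax(in, 1) == std::vector<size_t>{ 7, 5, 3, 2 })); // axis = 1 leaves the shape unchanged
    return 0;
}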
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
new file mode 100644
index 0000000..e947657
--- /dev/null
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESplit.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NESplit::NESplit()
+ : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
+{
+ // Create Slice functions
+ _num_outputs = outputs.size();
+ _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+ // Extract output tensor info
+ std::vector<ITensorInfo *> outputs_info;
+ for(auto &output : outputs)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ outputs_info.emplace_back(output->info());
+ }
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(NESplit::validate(input->info(), outputs_info, axis));
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ for(unsigned int i = 0; i < _num_outputs; i++)
+ {
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ // Configure slice function
+ _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+ // Set valid region from shape
+ outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+ // Update axis offset
+ axis_offset += axis_split_step;
+ }
+}
+
+Status NESplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ // Validate output tensors
+ for(const auto &output : outputs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+        // Output auto-initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->clone();
+ auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(input, output, start_coords, end_coords));
+ axis_offset += axis_split_step;
+ }
+
+ return Status{};
+}
+
+void NESplit::run()
+{
+ for(unsigned i = 0; i < _num_outputs; ++i)
+ {
+ _slice_functions[i].run();
+ }
+}
+} // namespace arm_compute
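(Illustration, not part of the patch.) NESplit::configure() above simply steps a window of output_shape[axis] elements along the split axis, one NESlice per output. A standalone sketch of that start/end bookkeeping, assuming the axis divides evenly as compute_split_shape() requires:

#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> input_shape = { 8, 6, 4 }; // x, y, z
    const unsigned int axis        = 1;               // split along y
    const unsigned int num_outputs = 3;

    // In the library this step comes from compute_split_shape(); here we assume an even split.
    const int axis_split_step = input_shape[axis] / num_outputs; // 2
    int axis_offset = 0;

    for(unsigned int i = 0; i < num_outputs; ++i)
    {
        // Each slice covers [axis_offset, axis_offset + step) on the split axis
        // and the full extent of every other dimension.
        std::printf("output %u: y in [%d, %d)\n", i, axis_offset, axis_offset + axis_split_step);
        axis_offset += axis_split_step;
    }
    return 0;
}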
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
new file mode 100644
index 0000000..2f49c22
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "support/ToolchainSupport.h"
+namespace arm_compute
+{
+NEStackLayer::NEStackLayer() // NOLINT
+ : _input(),
+ _stack_kernels(),
+ _num_inputs(0)
+{
+}
+
+void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
+{
+ _num_inputs = input.size();
+ _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ }
+}
+
+Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+ // Wrap around negative values
+ const size_t rank = input[0]->num_dimensions();
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+ const unsigned int num_inputs = input.size();
+
+ for(unsigned int i = 0; i < num_inputs; i++)
+ {
+ // All the tensors must have the same rank
+ ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+ }
+
+ return Status{};
+}
+
+void NEStackLayer::run()
+{
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
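(Illustration, not part of the patch.) Both configure() and validate() above wrap a possibly negative stacking axis against rank + 1, because the output tensor gains one dimension. A minimal sketch of that wrap-around, assuming wrap_around() folds negative values by adding the modulus:

#include <cassert>

// Same idea as arm_compute::wrap_around(): fold a possibly negative axis into
// [0, modulus). For stacking the modulus is rank + 1 since the output has one
// extra dimension.
static int wrap_around(int x, int modulus) { return x < 0 ? x + modulus : x; }

int main()
{
    const int rank = 3;                      // rank of each input tensor
    assert(wrap_around(-1, rank + 1) == 3);  // append the new axis after the existing ones
    assert(wrap_around(0, rank + 1) == 0);   // stack on a new leading axis
    return 0;
}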
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
new file mode 100644
index 0000000..53eb2b0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
+
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+ k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _kernel = std::move(k);
+}
+
+Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
new file mode 100644
index 0000000..0ca4413
--- /dev/null
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETile.h"
+
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NETileKernel>();
+ k->configure(input, output, multiples);
+ _kernel = std::move(k);
+}
+
+Status NETile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ return NETileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
new file mode 100644
index 0000000..7532020
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+ return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the slice coordinates: all start coordinates to 0 and all end coordinates to -1 (slice to the end), so the caller only has to adjust the unstacking axis to take one 2D slice at a time.
+ Coordinates slice_end;
+ slice_start.set_num_dimensions(input_num_dimensions);
+ slice_end.set_num_dimensions(input_num_dimensions);
+ for(size_t k = 0; k < input_num_dimensions; ++k)
+ {
+ slice_start.set(k, 0);
+ slice_end.set(k, -1);
+ }
+ slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+NEUnstack::NEUnstack() // NOLINT
+ : _num_slices(0),
+ _strided_slice_vector()
+{
+}
+
+void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
+{
+ std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_axis(axis, input->info());
+ _num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+ _strided_slice_vector = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+ for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ {
+        // Adjust the start coordinate on the unstacking axis to take one 2D slice at a time
+ slice_start.set(axis_u, slice);
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ }
+}
+
+Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+
+ const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ for(size_t k = 0; k < num_slices; ++k)
+ {
+ slice_start.set(wrap_axis(axis, input), k);
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ }
+ return Status{};
+}
+
+void NEUnstack::run()
+{
+ for(unsigned i = 0; i < _num_slices; ++i)
+ {
+ _strided_slice_vector[i].run();
+ }
+}
+} // namespace arm_compute
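(Illustration, not part of the patch.) Each NEStridedSlice configured above picks one index on the unstacking axis and passes shrink_axis_mask = 1 << axis, so that axis disappears from the output. A small sketch of the resulting shape bookkeeping (plain std::vector, not the kernel):

#include <cassert>
#include <vector>

// Unstacking along an axis: slice index k on that axis and drop the axis from
// the output shape.
static std::vector<int> unstacked_shape(std::vector<int> shape, unsigned int axis)
{
    shape.erase(shape.begin() + axis); // shrink_axis_mask = 1 << axis drops this dimension
    return shape;
}

int main()
{
    const std::vector<int> in = { 4, 3, 5 };
    assert((unstacked_shape(in, 1) == std::vector<int>{ 4, 5 })); // 3 slices, each of shape {4, 5}
    return 0;
}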
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 097605c..7e435c3 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
_num_inputs = inputs_vector.size();
std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
@@ -80,7 +80,7 @@
_concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
width_offset += inputs_vector.at(i)->info()->dimension(0);
@@ -89,7 +89,7 @@
void NEWidthConcatenateLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for(unsigned i = 0; i < _num_inputs; ++i)
{
NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index c8e3b3b..e37f8ab 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -464,6 +464,7 @@
transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
//The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+ _memory_group.manage(&_output_nhwc);
transform_output_kernel->configure(biases, &_output_workspace,
output_matrix_stride, &_output_nhwc,
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
@@ -483,16 +484,16 @@
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
}
- _weights_hwio.allocator()->allocate();
_gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
_input_workspace.allocator()->allocate();
- _kernel_storage.allocator()->allocate();
_output_workspace.allocator()->allocate();
// Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
- _output_nhwc.allocator()->allocate();
+ if(data_layout == DataLayout::NCHW)
+ {
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+ _output_nhwc.allocator()->allocate();
+ }
_transform_input_kernel = std::move(transform_input_kernel);
_transform_weights_kernel = std::move(transform_weights_kernel);
@@ -656,10 +657,12 @@
if(!_is_prepared)
{
// Permute weights
+ _weights_hwio.allocator()->allocate();
_permute_weights.run();
_weights->mark_as_unused();
// Transform weights
+ _kernel_storage.allocator()->allocate();
NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
_weights_hwio.allocator()->free();
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index c87e82a..34aaea0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,159 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
namespace arm_compute
{
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+ /** Number of buffers to ping pong between */
+ static constexpr unsigned int NUM_BUFFERS = 3;
+
+ explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+ : _max_num_users(max_num_users)
+ {
+ }
+ unsigned int num_buffers() const override
+ {
+ return NUM_BUFFERS;
+ }
+ /* - Lock the requested index if it's free and return true if it needs reshaping.
+ * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+ * - Block if the corresponding buffer for the given index is still being used by a different index.
+ */
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ while(true)
+ {
+ if(buf.index == index && buf.state != State::FREE)
+ {
+                //Another thread is already reshaping / has reshaped this block: nothing to do
+ return false;
+ }
+ else
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //If the buffer is free then lock it for reshaping:
+ if(buf.state == State::FREE)
+ {
+ buf.index = index;
+ buf.state = State::BEING_RESHAPED;
+ return true;
+ }
+ // Check again just in case it changed while we were acquiring the lock:
+ if(buf.index == index)
+ {
+ //Another thread is reshaping this block already, nothing to do
+ return false;
+ }
+ // buf.index != index: Buffer still being used by another block, need to wait
+ buf.sem.wait(lock);
+ }
+ }
+ }
+ /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+ void mark_as_reshaped(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ {
+ std::lock_guard<std::mutex> lock(buf.mutex);
+ buf.users = _max_num_users;
+ buf.state = State::IN_USE;
+ }
+ buf.sem.notify_all();
+ }
+
+ /* Block until the buffer at the given index is reshaped */
+ void wait_for_reshaping(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ // Check if it's already ready to use:
+ if(buf.state == State::IN_USE)
+ return;
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //Double check it didn't change while we were acquiring the lock:
+ if(buf.state == State::IN_USE)
+ return;
+ buf.sem.wait(lock);
+ }
+ /* Mark the buffer at the given index as not used by this thread anymore.
+ * Once all the threads have called this method then the buffer is marked as free again.
+ */
+ void mark_as_unused(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ if(--buf.users == 0)
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ buf.state = State::FREE;
+ lock.unlock();
+ buf.sem.notify_all();
+ }
+ }
+
+private:
+ enum class State
+ {
+ FREE,
+ BEING_RESHAPED,
+ IN_USE
+ };
+ struct Buffer
+ {
+ unsigned int index{};
+ std::atomic_uint users{};
+ State state{ State::FREE };
+ std::mutex mutex{};
+ std::condition_variable sem{};
+ } _buffers[NUM_BUFFERS];
+ Buffer &get_buffer_from_index(unsigned int index)
+ {
+ return _buffers[index % NUM_BUFFERS];
+ }
+ unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+ unsigned int num_buffers() const override
+ {
+ return 1;
+ }
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ return true;
+ }
+ void mark_as_reshaped(unsigned int index) override
+ {
+ }
+ void wait_for_reshaping(unsigned int index) override
+ {
+ }
+ void mark_as_unused(unsigned int index) override
+ {
+ }
+};
+
NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager))
{
}
+
void NEGEMMInterleavedWrapper::run()
{
prepare();
@@ -53,6 +194,7 @@
{
if(_pretranspose_b)
{
+ _transformed_b.allocator()->allocate();
NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
_b->mark_as_unused();
}
@@ -65,12 +207,13 @@
//Maximum number of workloads to create:
const unsigned int num_threads = NEScheduler::get().num_threads();
- const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
+ const unsigned int max_iterations = std::max(num_threads, _num_windows);
//Maximum number of iterations the parameters allow:
const unsigned int num_iterations = _batch_window.num_iterations_total();
// Keep the smallest of the two:
const unsigned int num_windows = std::min(num_iterations, max_iterations);
const TensorShape window_shape = _batch_window.shape();
+ const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
// Create a 1D window to dynamically split the batch window:
Window win_1D;
@@ -79,66 +222,119 @@
// Create one workload for each sub-window:
for(unsigned int w = 0; w < num_windows; w++)
{
- Window win = win_1D.split_window(0, w, num_windows);
- const Coordinates start_offset = index2coords(window_shape, win.x().start());
- const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+ Window win = win_1D.split_window(0, w, num_windows);
+ const Coordinates start_offset = index2coords(window_shape, win.x().start());
+ const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ if(_pretranspose_b)
{
- //For each block of rows in "M"
- auto workload_mm = this->_mm_workloads.begin();
- for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
{
- // Transform one k_block from A:
- this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
- // Then perform the matrix multiplication for each x block along N:
- for(unsigned int i = 0; i < num_x_blocks; i++)
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
{
- ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
- this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ }
}
- }
- };
- _workloads.push_back(workload);
+ };
+ _workloads.push_back(workload);
+ }
+ else
+ {
+ auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ {
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ unsigned int workload_b = 0;
+ //If there is only one thread then only reshape the B blocks as you need them:
+ unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
+
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ {
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ if(workload_b_next < this->_b_workloads.size())
+ {
+                            //Lock on BufferManager: do we need to run it?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ }
+ };
+ _workloads.push_back(workload);
+ }
+ }
+ if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+ {
+            //Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
+ for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+ {
+ auto workload = [this](const ThreadInfo & info)
+ {
+ unsigned int workload_b = 0;
+ //If there is only one thread then only reshape the B blocks as you need them:
+ unsigned int workload_b_next = 1;
+
+ for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+ {
+ if(workload_b_next < this->_b_workloads.size())
+ {
+                        //Lock on BufferManager: do we need to run it?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ };
+ _workloads.push_back(workload);
+ }
}
_is_prepared = true;
}
}
-namespace
-{
-// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
-{
- auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
- prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
- return std::move(prepare_b);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
-{
- auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
- transform_a->configure(a, transformed_a, false, block_walker, params);
- return std::move(transform_a);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, typename OutputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
-        const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
-{
- auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
- matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
- return std::move(matrix_multiply);
-}
-} // namespace
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b)
{
_params = INEGEMMWrapperKernel::extract_parameters(a, b, c);
_a = a;
@@ -146,124 +342,80 @@
_c = c;
_pretranspose_b = pretranspose_b;
- DataType input_type = a->info()->data_type();
+ const DataType input_type = a->info()->data_type();
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b);
+ ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
_transformed_b.allocator()->init(TensorInfo{}, alignment);
_tmp_c.allocator()->init(TensorInfo{}, alignment);
- _tag = "NEGEMMInterleaved_";
- _tag += get_strategy_name(input_type, use_dot);
+ _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
+
+ // Get strategy
+ std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
+ _num_windows = iceildiv(_params.M, strategy->out_height()) * _params.batches;
+ ARM_COMPUTE_ERROR_ON(strategy == nullptr);
if(!_pretranspose_b)
{
+ _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+        // If the execution is single threaded or there is only one window, the buffer manager only needs one buffer; otherwise we will use NUM_BUFFERS buffers and ping-pong between them:
+ const unsigned int num_iterations = _batch_window.num_iterations_total();
+ if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+ {
+ _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+ }
+ else
+ {
+#ifdef NO_MULTI_THREADING
+ ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else /* NO_MULTI_THREADING */
+ _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+ }
// If B is transposed at every iteration then transformed_B can be managed:
_memory_group.manage(&_transformed_b);
- _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+ auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
}
else
{
_tag += "_preB";
- switch(input_type)
- {
- case DataType::F32:
- _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+ }
+ _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
+ ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+ if(_pretranspose_b)
+ {
_block_sizes = _prepare_b->block_sizes();
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
}
_block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
_block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
_block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
- _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
- _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
_transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
_memory_group.manage(&_transformed_a);
_memory_group.manage(&_tmp_c);
- switch(input_type)
- {
- case DataType::F32:
- _transform_a = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _transform_a = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
+ _transform_a = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params);
+ _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads);
ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+
_transformed_a.allocator()->allocate();
_tmp_c.allocator()->allocate();
- _transformed_b.allocator()->allocate();
+ if(!_pretranspose_b)
+ {
+ _transformed_b.allocator()->allocate();
+ }
}
} // namespace arm_compute
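The refactor above replaces the per-data-type switch statements with a name-keyed strategy: the wrapper asks arm_gemm which kernel it would pick (get_gemm_info) and then instantiates the prepare/transform/multiply workloads through a single IInterleavedStrategy interface. Below is a minimal, self-contained sketch of that dispatch pattern; the strategy classes and kernel names are hypothetical stand-ins, not the library's actual types.

// Minimal sketch of name-keyed strategy dispatch (hypothetical types and names).
#include <iostream>
#include <memory>
#include <string>

struct IStrategy
{
    virtual ~IStrategy() = default;
    virtual unsigned int out_height() const = 0; // rows produced per kernel iteration
    virtual const char *name() const = 0;
};

struct Fp32Strategy : IStrategy
{
    unsigned int out_height() const override { return 8; }
    const char  *name() const override { return "sgemm_example"; }
};

struct U8DotStrategy : IStrategy
{
    unsigned int out_height() const override { return 4; }
    const char  *name() const override { return "u8_dot_example"; }
};

// Factory keyed by the kernel name reported by the GEMM heuristics,
// so the caller never switches on the data type itself.
std::unique_ptr<IStrategy> create_strategy(const std::string &kernel_name)
{
    if(kernel_name == "sgemm_example")
    {
        return std::make_unique<Fp32Strategy>();
    }
    if(kernel_name == "u8_dot_example")
    {
        return std::make_unique<U8DotStrategy>();
    }
    return nullptr; // unknown kernel
}

int main()
{
    const auto strategy = create_strategy("sgemm_example");
    if(strategy != nullptr)
    {
        std::cout << strategy->name() << ": out_height = " << strategy->out_height() << "\n";
    }
    return 0;
}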
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index d0b3bde..ad23220 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,16 @@
#include <map>
#include <vector>
-using namespace arm_compute;
-
+namespace arm_compute
+{
+namespace
+{
+size_t align_offset(size_t offset, size_t alignment)
+{
+ const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
+ return (remainder != 0U) ? offset + (alignment - remainder) : offset;
+}
+} // namespace
OffsetLifetimeManager::OffsetLifetimeManager()
: _blob(0)
{
@@ -58,11 +66,15 @@
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Update blob size
- size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
+ size_t max_aggregated_size = 0;
+ std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
{
- return s + b.max_size;
+ max_aggregated_size += b.max_size;
+ _blob.alignment = std::max(_blob.alignment, b.max_alignment);
});
- _blob = std::max(_blob, max_group_size);
+ max_aggregated_size += _free_blobs.size() * _blob.alignment;
+ _blob.owners = std::max(_blob.owners, _free_blobs.size());
+ _blob.size = std::max(_blob.size, max_aggregated_size);
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
@@ -76,6 +88,8 @@
group_mappings[bound_element.handle] = offset;
}
offset += free_blob.max_size;
- ARM_COMPUTE_ERROR_ON(offset > _blob);
+ offset = align_offset(offset, _blob.alignment);
+ ARM_COMPUTE_ERROR_ON(offset > _blob.size);
}
}
+} // namespace arm_compute
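The lifetime-manager change above rounds every blob's end offset up to the pool alignment and grows the pool by num_blobs * alignment bytes, so the padded offsets can never overrun the blob. A small standalone illustration of that rounding, mirroring the align_offset helper added in this hunk:

// Standalone illustration of offset alignment inside a single memory blob.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

size_t align_offset(size_t offset, size_t alignment)
{
    const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}

int main()
{
    const std::vector<size_t> blob_sizes = { 100, 260, 64 }; // example sizes in bytes
    const size_t              alignment  = 128;

    // Worst case: each blob may need up to (alignment - 1) padding bytes,
    // hence the pool reserves an extra num_blobs * alignment bytes.
    size_t pool_size = blob_sizes.size() * alignment;
    for(size_t size : blob_sizes)
    {
        pool_size += size;
    }

    size_t offset = 0;
    for(size_t size : blob_sizes)
    {
        std::cout << "blob of " << size << " bytes starts at offset " << offset << "\n";
        offset += size;
        offset = align_offset(offset, alignment); // e.g. 100 -> 128, 388 -> 512
        assert(offset <= pool_size);
    }
    return 0;
}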
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 36eaf0b..70cbe90 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,11 +34,11 @@
using namespace arm_compute;
-OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, size_t blob_size)
- : _allocator(allocator), _blob(), _blob_size(blob_size)
+OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
+ : _allocator(allocator), _blob(), _blob_info(blob_info)
{
ARM_COMPUTE_ERROR_ON(!allocator);
- _blob = _allocator->make_region(blob_size, 0);
+ _blob = _allocator->make_region(blob_info.size, blob_info.alignment);
}
void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -49,7 +49,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
+ handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
}
}
@@ -70,5 +70,5 @@
std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate()
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_size);
+ return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info);
}
\ No newline at end of file
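OffsetMemoryPool now carries the blob's size and alignment together (BlobInfo) instead of a bare size, so duplicated pools allocate with the same alignment and sub-regions are still sized against the total. A rough sketch of carving handle offsets out of one aligned blob; the BlobInfo aggregate and the raw buffer here are illustrative stand-ins, not the library's memory-region types.

// Rough sketch: sub-regions carved out of a single aligned blob (illustrative types only).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>

struct BlobInfo
{
    size_t size{ 0 };      // total bytes backing the pool
    size_t alignment{ 0 }; // required alignment of the blob start
};

int main()
{
    const BlobInfo info{ 1024, 128 };

    // One aligned buffer backs every tensor in the pool.
    alignas(128) static uint8_t blob[1024];

    // Handle -> offset mappings as produced by the lifetime manager.
    const std::map<int, size_t> mappings = { { 0, 0 }, { 1, 256 }, { 2, 640 } };

    for(const auto &m : mappings)
    {
        // Each handle points into the blob; its usable size is whatever remains past the offset.
        const uint8_t *ptr       = blob + m.second;
        const size_t   remaining = info.size - m.second;
        std::cout << "handle " << m.first << " -> " << static_cast<const void *>(ptr)
                  << " (up to " << remaining << " bytes)\n";
    }
    return 0;
}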
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5fa51d7..38edb8b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -138,7 +138,7 @@
}
else
{
- _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
}
info().set_is_resizable(false);
}
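This last hunk completes the chain: the tensor allocator now forwards its alignment requirement together with the total size, so the memory group can hand both to the lifetime manager shown earlier. A toy sketch of that propagation under assumed, simplified interfaces (not the library's actual classes):

// Toy sketch: propagating an alignment requirement alongside a size (simplified interfaces).
#include <algorithm>
#include <cstddef>
#include <iostream>

struct LifetimeManager
{
    size_t blob_alignment{ 0 };
    void   register_memory(size_t size, size_t alignment)
    {
        // The pool alignment is the maximum alignment requested by any client.
        blob_alignment = std::max(blob_alignment, alignment);
        std::cout << "registered " << size << " bytes with alignment " << alignment << "\n";
    }
};

struct MemoryGroup
{
    LifetimeManager *manager;
    void             finalize_memory(size_t size, size_t alignment)
    {
        manager->register_memory(size, alignment);
    }
};

int main()
{
    LifetimeManager manager;
    MemoryGroup     group{ &manager };
    // The allocator passes both its total size and its alignment requirement.
    group.finalize_memory(4096, 64);
    std::cout << "pool alignment is now " << manager.blob_alignment << "\n";
    return 0;
}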