arm_compute v19.05
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index 74af99b..cfb36e1 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,10 +47,10 @@
{
if(_info == nullptr)
{
- return ValidRegion();
+ return ValidRegion{};
}
- return ValidRegion(Coordinates(), _info->tensor_shape());
+ return ValidRegion{ Coordinates(), _info->tensor_shape() };
}
void AccessWindowAutoPadding::set_valid_region()
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 18ef185..f4ceca8 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -214,7 +214,10 @@
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
- WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
+ WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)),
+ WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7)),
+ WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)),
};
auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -255,4 +258,11 @@
return 1;
}
}
+
+bool preferred_dummy_work_items_support(const cl::Device &device)
+{
+ ARM_COMPUTE_UNUSED(device);
+ // TODO (COMPMID-2044)
+ return true;
+}
} // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 4ecb885..df60001 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -190,6 +190,7 @@
{ "compare_lessequal_quantized", "comparisons.cl" },
{ "concatenate_depth", "concatenate.cl" },
{ "concatenate_width", "concatenate.cl" },
+ { "concatenate_height", "concatenate.cl" },
{ "concatenate_width_x2", "concatenate.cl" },
{ "concatenate_width_x4", "concatenate.cl" },
{ "convolution_rectangle", "convolution_rectangle.cl" },
@@ -212,16 +213,18 @@
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
+ { "crop_tensor", "crop_tensor.cl" },
+ { "deconvolution_reshape", "deconvolution_layer.cl" },
{ "deconvolution_upsample", "deconvolution_layer.cl" },
{ "depthwise_convolution_3x3", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
- { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_nhwc", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_dot8_nchw", "depthwise_convolution_quantized.cl" },
- { "depthwise_convolution_3x3_quantized_dot8_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
+ { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
{ "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
@@ -258,12 +261,39 @@
{ "elementwise_unary", "elementwise_unary.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
- { "flatten", "flatten.cl" },
+ { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
+ { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
+ { "fft_radix_2_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_2_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_2_axis_0", "fft.cl" },
+ { "fft_radix_2_axis_1", "fft.cl" },
+ { "fft_radix_3_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_3_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_3_axis_0", "fft.cl" },
+ { "fft_radix_3_axis_1", "fft.cl" },
+ { "fft_radix_4_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_4_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_4_axis_0", "fft.cl" },
+ { "fft_radix_4_axis_1", "fft.cl" },
+ { "fft_radix_5_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_5_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_5_axis_0", "fft.cl" },
+ { "fft_radix_5_axis_1", "fft.cl" },
+ { "fft_radix_7_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_7_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_7_axis_0", "fft.cl" },
+ { "fft_radix_7_axis_1", "fft.cl" },
+ { "fft_radix_8_first_stage_axis_0", "fft.cl" },
+ { "fft_radix_8_first_stage_axis_1", "fft.cl" },
+ { "fft_radix_8_axis_0", "fft.cl" },
+ { "fft_radix_8_axis_1", "fft.cl" },
+ { "fft_scale_conj", "fft_scale.cl" },
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
- { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
+ { "flatten", "flatten.cl" },
{ "floor_layer", "floor.cl" },
+ { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
{ "gather", "gather.cl" },
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
@@ -284,6 +314,8 @@
{ "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
{ "gemm_transpose1xW", "gemm.cl" },
{ "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
@@ -301,6 +333,7 @@
{ "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
{ "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
{ "gemmlowp_mm_reshaped_lhs_nt_rhs_t_dot8", "gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
@@ -373,6 +406,7 @@
{ "NV21_to_YUV444_bt709", "color_convert.cl" },
{ "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
{ "permute", "permute.cl" },
+ { "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
@@ -470,6 +504,9 @@
{ "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
{ "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" },
{ "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" },
+ { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" },
+ { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" },
{ "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
{ "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" },
{ "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
@@ -488,6 +525,9 @@
{ "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
{ "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" },
{ "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" },
+ { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" },
+ { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" },
+ { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" },
{ "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
{ "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" },
{ "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" },
@@ -503,6 +543,9 @@
{ "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
{ "yolo_layer_nchw", "yolo_layer.cl" },
{ "yolo_layer_nhwc", "yolo_layer.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
@@ -607,6 +650,10 @@
#include "./cl_kernels/copy_tensor.clembed"
},
{
+ "crop_tensor.cl",
+#include "./cl_kernels/crop_tensor.clembed"
+ },
+ {
"upsample_layer.cl",
#include "./cl_kernels/upsample_layer.clembed"
},
@@ -675,14 +722,26 @@
#include "./cl_kernels/fast_corners.clembed"
},
{
- "flatten.cl",
-#include "./cl_kernels/flatten.clembed"
+ "fft.cl",
+#include "./cl_kernels/fft.clembed"
+ },
+ {
+ "fft_digit_reverse.cl",
+#include "./cl_kernels/fft_digit_reverse.clembed"
+ },
+ {
+ "fft_scale.cl",
+#include "./cl_kernels/fft_scale.clembed"
},
{
"fill_border.cl",
#include "./cl_kernels/fill_border.clembed"
},
{
+ "flatten.cl",
+#include "./cl_kernels/flatten.clembed"
+ },
+ {
"floor.cl",
#include "./cl_kernels/floor.clembed"
},
@@ -1035,7 +1094,7 @@
return Kernel(kernel_name, cl_program);
}
-void CLKernelLibrary::add_built_program(const std::string &built_program_name, cl::Program program)
+void CLKernelLibrary::add_built_program(const std::string &built_program_name, const cl::Program &program)
{
_built_programs_map.emplace(built_program_name, program);
}
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 995fcb4..2d28a49 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@
using namespace arm_compute;
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint)
+void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
{
if(kernel.kernel()() == nullptr)
{
@@ -58,6 +58,13 @@
return;
}
+ // Use dummy work-items
+ if(use_dummy_work_items)
+ {
+ gws.get()[0] = get_next_power_two(gws[0]);
+ gws.get()[1] = get_next_power_two(gws[1]);
+ }
+
cl::NDRange valid_lws;
if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
{
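Note on the use_dummy_work_items path above: enqueue() can now round each 2D global work size up to the next power of two, launching extra "dummy" work-items that the kernel is expected to discard, while preferred_dummy_work_items_support() reports whether the target device benefits (currently always true, pending COMPMID-2044). A minimal, hedged C++ sketch of the rounding, using a local next_pow2 helper in place of the library's get_next_power_two:

#include <cstddef>
#include <cstdio>

// Illustrative stand-in for arm_compute's get_next_power_two.
static size_t next_pow2(size_t n)
{
    size_t p = 1;
    while(p < n)
    {
        p <<= 1;
    }
    return p;
}

int main()
{
    size_t gws[2]    = { 60, 17 };                                 // real global work size
    size_t padded[2] = { next_pow2(gws[0]), next_pow2(gws[1]) };   // 64 x 32 with dummy work-items enabled
    std::printf("%zu x %zu -> %zu x %zu\n", gws[0], gws[1], padded[0], padded[1]);
    // Kernels launched with the padded size are expected to return early for
    // work-items whose global id falls outside the real bounds.
    return 0;
}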
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 6725f36..ef03a5a 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -120,6 +120,9 @@
LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
LOAD_FUNCTION_PTR(clWaitForEvents, handle);
+ // Third-party extensions
+ LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
+
#undef LOAD_FUNCTION_PTR
//Don't call dlclose(handle) or all the symbols will be unloaded !
@@ -919,3 +922,27 @@
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_mem
+clImportMemoryARM(cl_context context,
+ cl_mem_flags flags,
+ const cl_import_properties_arm *properties,
+ void *memory,
+ size_t size,
+ cl_int *errcode_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
+ if(func != nullptr)
+ {
+ return func(context, flags, properties, memory, size, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
\ No newline at end of file
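The clImportMemoryARM shim above forwards to the cl_arm_import_memory extension when the dynamically loaded OpenCL library provides it, and reports CL_OUT_OF_RESOURCES otherwise. A hedged usage sketch, assuming the extension declarations are visible through CL/cl_ext.h and that a null properties list requests the default host-memory import (buffer name and size are illustrative):

#include <CL/cl_ext.h> // assumed to declare clImportMemoryARM and cl_import_properties_arm on ARM OpenCL stacks

// Wrap an existing page-aligned host allocation as a cl_mem without copying it.
cl_mem import_host_buffer(cl_context context, void *host_ptr, size_t size_in_bytes)
{
    cl_int err    = CL_SUCCESS;
    cl_mem buffer = clImportMemoryARM(context, CL_MEM_READ_WRITE,
                                      nullptr /* assumed: default (host) import properties */,
                                      host_ptr, size_in_bytes, &err);
    return (err == CL_SUCCESS) ? buffer : nullptr;
}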
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index dfd16e0..60307bc 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -341,22 +341,10 @@
Vector bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
Vector bn_var = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
- // In-place ops
-#ifdef IN_PLACE_W
- Tensor4D fused_w = conv_w;
-#else /* IN_PLACE_W */
- Tensor4D fused_w = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
-#endif /* IN_PLACE */
-#ifdef IN_PLACE_B
- Vector fused_b = conv_b;
-#else /* IN_PLACE_W */
- Vector fused_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
-#endif /* IN_PLACE */
-
// Conditional ops
#ifdef HAS_BIAS
Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
-#endif /* USE_DEFAULT_BETA */
+#endif /* HAS_BIAS */
#ifndef USE_DEFAULT_BETA
Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
#endif /* USE_DEFAULT_BETA */
@@ -364,6 +352,19 @@
Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
#endif /* USE_DEFAULT_GAMMA */
+ // In-place ops
+#ifdef IN_PLACE_W
+ Tensor4D fused_w = conv_w;
+ uint fused_w_stride_x = conv_w_stride_x;
+#else /* IN_PLACE_W */
+ Tensor4D fused_w = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE_W */
+#ifdef IN_PLACE_B
+ Vector fused_b = conv_b;
+#else /* IN_PLACE_B */
+ Vector fused_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE_B */
+
const int current_slice = get_global_id(2) / NUM_CHANNELS;
#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index c374769..e365683 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -132,10 +132,10 @@
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2)
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond);
@@ -330,6 +330,59 @@
#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
+#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
+/** This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Vector sizes supported are 2,4,8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void concatenate_height(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ VSTORE(VEC_SIZE)
+ (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+ VSTORE(VEC_SIZE)
+ (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+
+#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */
+
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
@@ -356,20 +409,19 @@
__kernel void concatenate_depth(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
- int3 offsets)
+ int offset)
{
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
+ source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
VSTORE(VEC_SIZE)
- (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
-
+ (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset));
}
#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
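Two behavioural points in the hunk above: concatenate_height stores each input at dst.ptr + HEIGHT_OFFSET * dst_stride_y, so the host side passes a growing HEIGHT_OFFSET as it stacks inputs along Y, and concatenate_depth now receives a single precomputed byte offset instead of an int3. A hedged host-side illustration of how the height offset accumulates (tensor heights are illustrative):

#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> input_heights = { 8, 16, 4 }; // heights of the tensors to stack along Y
    int height_offset = 0;
    for(size_t i = 0; i < input_heights.size(); ++i)
    {
        // The i-th kernel instance would be compiled with -DHEIGHT_OFFSET=<height_offset>,
        // so its work-items write at dst.ptr + HEIGHT_OFFSET * dst_stride_y.
        std::printf("input %zu -> -DHEIGHT_OFFSET=%d\n", i, height_offset);
        height_offset += input_heights[i];
    }
    return 0;
}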
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 4bbbf11..f4366b8 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,7 @@
}
#endif // Compile time constants
+#if defined(DATA_TYPE)
/** Performs a copy of input tensor to the output tensor.
*
* @param[in] in_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -103,6 +104,16 @@
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does then shift access vector to access elements within bounds
+ const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+ in.ptr -= shift * in.stride_x;
+ out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
// Load data
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
@@ -110,4 +121,8 @@
// Store result
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)out.ptr);
+#else // defined(VEC_SIZE)
+ *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr));
+#endif // defined(VEC_SIZE)
}
+#endif // defined(DATA_TYPE)
\ No newline at end of file
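The LAST_ACCESSED_X logic added above keeps vector accesses inside the tensor: if a work-item's vector would start past the last valid position, the pointer is shifted back so the load/store overlaps the previous work-item's span instead of running out of bounds. A hedged numeric sketch of the shift computation (the LAST_ACCESSED_X definition here is an assumption for illustration):

#include <algorithm>
#include <cstdio>

int main()
{
    const int vec_size        = 4;
    const int width           = 10;               // elements along X
    const int last_accessed_x = width - vec_size; // assumed last valid start index (6)

    for(int gid = 0; gid * vec_size < width; ++gid)
    {
        const int start = gid * vec_size;
        const int shift = std::max(start - last_accessed_x, 0);
        std::printf("work-item %d accesses elements [%d, %d)\n", gid, start - shift, start - shift + vec_size);
    }
    return 0;
}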
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/crop_tensor.cl
new file mode 100644
index 0000000..55f8544
--- /dev/null
+++ b/src/core/CL/cl_kernels/crop_tensor.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) // Compile time constants
+
+/** Performs cropping of a region of the input tensor into the output tensor, converting the selected elements to F32.
+ *
+ * @param[in] in_ptr Pointer to the source tensor. Supported data types: U16/S16/F16/U32/S32/F32
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] in_offset_y The initial offset of the input address along Y.
+ * @param[in] in_offset_z The initial offset of the input address along Z.
+ */
+__kernel void crop_tensor(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out),
+ int in_offset_y,
+ int in_offset_z)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ const int in_x = get_global_id(0) * (in_step_x / in_stride_x);
+
+#if defined(WIDTH_FLIPPED)
+ const int in_y = in_offset_y - get_global_id(1);
+#else // defined(WIDTH_FLIPPED)
+ const int in_y = in_offset_y + get_global_id(1);
+#endif // defined(WIDTH_FLIPPED)
+
+#if defined(HEIGHT_FLIPPED)
+ const int in_z = in_offset_z - get_global_id(2);
+#else // defined(HEIGHT_FLIPPED)
+ const int in_z = in_offset_z + get_global_id(2);
+#endif // defined(HEIGHT_FLIPPED)
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does then shift access vector to access elements within bounds
+ const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+ in.ptr -= shift * in.stride_x;
+ out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+ __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr);
+#else // defined(VEC_SIZE)
+ *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // defined(DATA_TYPE)
\ No newline at end of file
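The crop_tensor kernel maps every output coordinate back to an input coordinate starting at in_offset_y / in_offset_z and walking forwards or backwards depending on WIDTH_FLIPPED / HEIGHT_FLIPPED, then stores the value converted to float. A hedged sketch of that coordinate mapping (offsets and flags are illustrative):

#include <cstdio>

int main()
{
    const int  in_offset_y    = 5;     // first input row of the crop window
    const int  in_offset_z    = 2;     // first input slice of the crop window
    const bool width_flipped  = true;  // stands in for -DWIDTH_FLIPPED
    const bool height_flipped = false; // stands in for -DHEIGHT_FLIPPED

    for(int out_y = 0; out_y < 3; ++out_y)
    {
        for(int out_z = 0; out_z < 2; ++out_z)
        {
            const int in_y = width_flipped ? in_offset_y - out_y : in_offset_y + out_y;
            const int in_z = height_flipped ? in_offset_z - out_z : in_offset_z + out_z;
            std::printf("out (y=%d, z=%d) <- in (y=%d, z=%d)\n", out_y, out_z, in_y, in_z);
        }
    }
    return 0;
}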
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index e5169f9..ea2455c 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,3 +52,79 @@
// Store result
*((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
}
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
+ * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] bias_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void deconvolution_reshape(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(ADD_BIAS)
+)
+{
+#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
+
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+ const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
+
+ // Store result
+ const int x_in = get_global_id(0);
+ const int y_in = get_global_id(1);
+ const int z_in = get_global_id(2);
+
+#if defined(NUM_FILTERS)
+ const int bias_index = x_in / (FILTER_AREA);
+ const int z_out = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
+ const int x_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+ const int y_out = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+#else // defined(NUM_FILTERS)
+ const int x_out = x_in / (FILTER_AREA);
+ const int y_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+ const int z_out = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+ const int bias_index = x_out;
+#endif // defined(NUM_FILTERS)
+
+#if defined(ADD_BIAS)
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+ const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
+#else // defined(ADD_BIAS)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
+#endif // defined(ADD_BIAS)
+
+#undef FILTER_AREA
+}
+#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
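The index arithmetic in deconvolution_reshape scatters each element of the GEMM output (whose x axis packs filter_w * filter_h values per input position) into the reshaped output tensor; the NCHW branch uses the formulas repeated below. A hedged worked example that simply evaluates those formulas for a small illustrative filter:

#include <cstdio>

int main()
{
    const int filter_w    = 2;
    const int filter_h    = 2;
    const int filter_area = filter_w * filter_h;

    const int y_in = 1; // fixed example coordinates in the GEMM output
    const int z_in = 3;

    for(int x_in = 0; x_in < 2 * filter_area; ++x_in)
    {
        const int x_out = x_in / filter_area;                                  // also the bias index in the NCHW branch
        const int y_out = x_in % filter_w + y_in * filter_w;
        const int z_out = filter_h * z_in + (x_in % filter_area) / filter_w;
        std::printf("in (%d,%d,%d) -> out (%d,%d,%d)\n", x_in, y_in, z_in, x_out, y_out, z_out);
    }
    return 0;
}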
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 4f6fdfa..a8611af 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,7 +24,141 @@
#include "helpers.h"
-#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#if defined(FUSED_ACTIVATION)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+#include "activation_helpers.h"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
+/** Get the pointer position at a certain offset in x and y direction.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
+ *
+ * @return A pointer to the element at the requested x/y offset
+ */
+inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
+{
+ return ptr + x * stride_x + y * stride_y;
+}
+
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
+ })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
+ })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
+ })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
+ })
+
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
+ })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
+ })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
+ })
+
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
+ ({ \
+ acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
+ acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
+ acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
+ acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
+ acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
+ acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
+ acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
+ acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
+ acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
+ acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
+ acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
+ acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
+ })
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
+#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
@@ -51,13 +185,18 @@
const float middle_coeff,
const float right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
float4 temp = vload4(0, (__global float *)left_pixel);
float2 left = CONVERT(temp.s01, float2);
float2 middle = CONVERT(temp.s12, float2);
float2 right = CONVERT(temp.s23, float2);
-
return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+ return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
+ + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
+ + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
@@ -74,6 +213,7 @@
const float middle_coeff,
const float right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
float4 temp0 = vload4(0, (__global float *)left_pixel);
float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
@@ -82,6 +222,14 @@
float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+ __global float *left_pixel_float = (__global float *)left_pixel;
+
+ return vload4(0, left_pixel_float).s02 * (float2)left_coeff
+ + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
+ + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
@@ -98,6 +246,7 @@
const float middle_coeff,
const float right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
float4 temp0 = vload4(0, (__global float *)left_pixel);
float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
@@ -106,6 +255,13 @@
float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+ __global float *left_pixel_float = (__global float *)left_pixel;
+
+ return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
+ + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
+ + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
@@ -139,8 +295,8 @@
float2 pixels;
pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
- pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
- pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
+ pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
+ pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
return pixels;
}
@@ -212,65 +368,99 @@
pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
#endif //defined(HAS_BIAS)
- vstore2(pixels, 0, (__global float *)dst.ptr);
+ vstore2(ACTIVATION_FUNC(pixels), 0, (__global float *)dst.ptr);
}
#endif //defined(CONV_STRIDE_X)
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- })
+#if(DILATION_X > 1 || DILATION_Y > 1)
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
- })
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+ const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+ // Load the weights
+ float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
-#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
- })
+ float2 pixels0 = 0.0f;
-#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
- ({ \
- acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
- acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
- acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
- acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
- acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
- acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
- acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
- acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
- acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
- acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
- acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
- acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
- })
+ float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+ float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+ float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+ float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+ float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+ float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+ float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+ float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+ float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+ CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+ CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+ CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+ return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
+ *
+ * @param[in] src_addr Pointer to the starting position of where to perform the convolution
+ * @param[in] stride_x_bytes Stride of the source tensor in X dimension (in bytes)
+ * @param[in] stride_y_bytes Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+ const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+ // Load the weights
+ float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+ float2 pixels0 = 0.0f;
+
+ float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+ float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+ float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+ float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+ float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+ float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+ float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+ float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+ float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+ CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+ CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+ CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+ return pixels0;
+}
+
+#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 1
*
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=float
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -326,6 +516,7 @@
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+#if(DILATION_X == 1 && DILATION_Y == 1)
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
@@ -352,6 +543,19 @@
CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+ //3x3 Convolution of elements starting in 0th row
+ pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 1st row
+ pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 2nd row
+ pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 3rd row
+ pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
@@ -363,15 +567,21 @@
pixels3 += (float2)bias;
#endif /* defined(HAS_BIAS) */
- vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
- vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
- vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels2), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels3), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 2
*
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=float
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -425,6 +635,8 @@
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
@@ -449,6 +661,14 @@
CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+ //3x3 Convolution of elements starting in 0th row
+ pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 2nd row
+ pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
@@ -458,11 +678,11 @@
pixels1 += (float2)bias;
#endif /* defined(HAS_BIAS) */
- vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
- vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
-#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
/** Reshape the weights for quantized depthwise convolution
@@ -632,11 +852,12 @@
}
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -661,7 +882,7 @@
const int src_pixel_linear = get_global_id(1) * STRIDE_X;
const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
- const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
+ const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
@@ -670,9 +891,9 @@
__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
- for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
+ for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
{
- for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
+ for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
{
if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
{
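A small C sketch (illustrative only, not part of the patch) of the loop-bound change above: with dilation the 3x3 window still reads KERNEL_WIDTH x KERNEL_HEIGHT samples, but they are spaced DILATION_X and DILATION_Y apart, so the covered extent grows accordingly.

/* Effective extent of one dilated kernel axis. */
static inline int dilated_extent(int kernel_size, int dilation)
{
    return kernel_size + (kernel_size - 1) * (dilation - 1);
}
/* dilated_extent(3, 1) == 3 and dilated_extent(3, 2) == 5: a 3-tap kernel with dilation 2
 * samples x, x + 2 and x + 4, which is exactly what the new loop bounds cover. */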
@@ -728,7 +949,7 @@
#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -740,6 +961,86 @@
#error "Stride not supported"
#endif /* CONV_STRIDE_X */
+#if(DILATION_X > 1 || DILATION_Y > 1)
+
+/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F16
+ *
+ * @param[in] src_addr Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+ const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+ // Load the weights
+ half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+ half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+ half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+ half4 pixels0 = 0.0f;
+
+ half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+ half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+ half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+ half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+ half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+ half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+ half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+ half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+ half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+ return pixels0;
+}
+
+/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
+ *
+ * @param[in] src_addr Pointer to the starting position of where to perform the convolution
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] y_offset Offset from the source tensor from which to start convolution
+ * @param[in] weights_addr Pointer from where to get weights
+ * @param[in] weights_stride_y Stride of weights tensor in Y dimension
+ */
+inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
+ const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
+{
+ // Load the weights
+ half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+ half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+ half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+ half4 pixels0 = 0.0f;
+
+ half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
+ half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+ half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
+
+ half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
+ half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+ half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
+
+ half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
+ half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+ half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
+
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
+ CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
+
+ return pixels0;
+}
+
+#endif // (DILATION_X > 1 || DILATION_Y > 1)
+
/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
*
* @param[in] left_pixel Pointer to the left pixel.
@@ -754,6 +1055,8 @@
const half middle_coeff,
const half right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
half8 temp = vload8(0, (__global half *)left_pixel);
half4 left = CONVERT(temp.s0123, half4);
@@ -761,6 +1064,12 @@
half4 right = CONVERT(temp.s2345, half4);
return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+ return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
+ + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
+ + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
@@ -777,6 +1086,8 @@
const half middle_coeff,
const half right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
half8 temp0 = vload8(0, (__global half *)left_pixel);
half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
@@ -785,6 +1096,15 @@
half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+ __global half *left_pixel_float = (__global half *)left_pixel;
+
+ return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
+ + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
+ + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
@@ -801,6 +1121,8 @@
const half middle_coeff,
const half right_coeff)
{
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
half16 temp0 = vload16(0, (__global half *)left_pixel);
half4 left = CONVERT(temp0.s0369, half4);
@@ -808,6 +1130,15 @@
half4 right = CONVERT(temp0.s258B, half4);
return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+ __global half *left_pixel_float = (__global half *)left_pixel;
+
+ return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
+ + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
+ + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
@@ -841,8 +1172,8 @@
half4 pixels;
pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
- pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
- pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
+ pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
+ pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
return pixels;
}
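The dilated variant above offsets the second and third row convolutions by DILATION_Y and 2 * DILATION_Y instead of 1 and 2. As a reference, a scalar sketch of the dilated 3x3 convolution (illustrative only; it assumes a row-major, single-channel float image of width src_w with (x0, y0) as the top-left sample):

static inline float conv3x3_dilated_ref(const float *src, int src_w, int x0, int y0,
                                        const float w[9], int dilation_x, int dilation_y)
{
    float acc = 0.0f;
    for(int ky = 0; ky < 3; ++ky)
    {
        for(int kx = 0; kx < 3; ++kx)
        {
            acc += src[(y0 + ky * dilation_y) * src_w + (x0 + kx * dilation_x)] * w[ky * 3 + kx];
        }
    }
    return acc;
}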
@@ -851,6 +1182,12 @@
/** This OpenCL kernel computes the depthwise convolution 3x3
*
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If the activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=half
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -875,7 +1212,7 @@
* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
- * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
@@ -916,7 +1253,7 @@
pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
#endif //defined(HAS_BIAS)
- vstore4(pixels, 0, (__global half *)dst.ptr);
+ vstore4(ACTIVATION_FUNC(pixels), 0, (__global half *)dst.ptr);
}
#endif // defined(DEPTH_MULTIPLIER)
#endif // defined(CONV_STRIDE_X)
@@ -924,6 +1261,12 @@
/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
* when both stride_x and stride_y are equal to 1
*
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If the activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=half
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -986,6 +1329,7 @@
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+#if(DILATION_X == 1 && DILATION_Y == 1)
// Load the weights
half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
@@ -1012,6 +1356,19 @@
CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
+#else /* DILATION_X==1 && DILATION_Y==1 */
+
+ //3x3 Convolution of elements starting in 0th row
+ pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 1st row
+ pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 2nd row
+ pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 3rd row
+ pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
+
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
#ifdef HAS_BIAS
pixels0 += (half4)bias;
pixels1 += (half4)bias;
@@ -1019,15 +1376,21 @@
pixels3 += (half4)bias;
#endif /* defined(HAS_BIAS) */
- vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
- vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
- vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels2), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels3), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
}
/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
* when both stride_x and stride_y are equal to 2
*
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note If the activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=half. Supported data types: half.
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=half
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1088,6 +1451,8 @@
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+#if(DILATION_X == 1 && DILATION_Y == 1)
+
// Load the weights
half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
@@ -1112,15 +1477,22 @@
CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
+#else /* DILATION_X==1 && DILATION_Y==1 */
+ //3x3 Convolution of elements starting in 0th row
+ pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
+ //3x3 Convolution of elements starting in 2nd row
+ pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+#endif /* DILATION_X==1 && DILATION_Y==1 */
+
#ifdef HAS_BIAS
pixels0 += (half4)bias;
pixels1 += (half4)bias;
#endif /* defined(HAS_BIAS) */
- vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
- vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
@@ -1140,8 +1512,12 @@
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=half
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1161,7 +1537,7 @@
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -1189,9 +1565,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
@@ -1203,7 +1579,7 @@
int z_coord = 0;
int4 offset = 0;
- int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
+ int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
// We compute 2x1x1 [C,W,H] elements
VEC_FLOAT acc = 0;
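A worked example of the new y_offset computation (illustrative numbers only): with CONV_STRIDE_X = 2, DILATION_X = 2, CONV_PAD_LEFT = 1 and y = 3, the four sampled columns are 6 + {0, 2, 4, 6} - 1 = {5, 7, 9, 11}, which are then scaled by src_stride_y into byte offsets. A scalar model of the same index arithmetic:

static inline void dilated_columns(int y, int stride_x, int dilation_x, int pad_left, int cols[4])
{
    for(int i = 0; i < 4; ++i)
    {
        cols[i] = y * stride_x + i * dilation_x - pad_left;
    }
}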
@@ -1236,16 +1612,16 @@
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
+ z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
// z == 2
- // After z = 1 we can simply add src_stride_z to offset without updating z_coord
- // However offset can be out-of-bound so we need to check if it is greater than max_offset
- offset += (int4)src_stride_z;
+ // Offset can be out-of-bound so we need to check if it is greater than max_offset
+ z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+ offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
@@ -1276,21 +1652,26 @@
#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
- (acc, 0, (__global DATA_TYPE *)(dst_addr));
+ (ACTIVATION_FUNC(acc), 0, (__global DATA_TYPE *)(dst_addr));
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
*
+ * @note The data type must be passed as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=float
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
+ * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
+ * @note The select data type must also be passed at compile time using -DSELECT_DATA_TYPE, e.g. -DSELECT_DATA_TYPE=half
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1310,7 +1691,7 @@
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -1338,9 +1719,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
@@ -1476,18 +1857,18 @@
#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
- (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ (ACTIVATION_FUNC(acc0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
VSTORE(VEC_SIZE)
- (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ (ACTIVATION_FUNC(acc1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
{
VSTORE(VEC_SIZE)
- (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+ (ACTIVATION_FUNC(acc2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
VSTORE(VEC_SIZE)
- (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+ (ACTIVATION_FUNC(acc3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
}
}
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 606af2e..8d145a0 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -53,6 +53,8 @@
#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
+#if DILATION_X == 1
+
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
({ \
@@ -85,6 +87,46 @@
})
#endif /* CONV_STRIDE_X */
+#else /* DILATION_X == 1 */
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ left = CONVERT(vload8(0, first_value), int8); \
+ middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8); \
+ right = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \
+ })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ int16 temp0 = CONVERT(vload16(0, first_value), int16); \
+ left = CONVERT(temp0.s02468ace, int8); \
+ \
+ temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
+ middle = CONVERT(temp0.s02468ace, int8); \
+ \
+ temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
+ right = CONVERT(temp0.s02468ace, int8); \
+ })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ int16 temp0 = CONVERT(vload16(0, first_value), int16); \
+ int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
+ left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
+ \
+ temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
+ temp1 = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8); \
+ middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
+ \
+ temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
+ temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \
+ right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
+ })
+
+#endif /* CONV_STRIDE_X */
+#endif /* DILATION_X==1 */
+
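A scalar model (illustrative only) of what the dilated GET_VALUES gather produces for CONV_STRIDE_X == 2: output column i takes the pixels at x = 2*i, 2*i + DILATION_X and 2*i + 2*DILATION_X, which is what the strided swizzles above implement with vector loads.

static void get_values_stride2_dilated_ref(const unsigned char *row, int dilation_x,
                                           int left[8], int middle[8], int right[8])
{
    for(int i = 0; i < 8; ++i)
    {
        left[i]   = row[2 * i];
        middle[i] = row[2 * i + dilation_x];
        right[i]  = row[2 * i + 2 * dilation_x];
    }
}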
/** This function computes the depthwise convolution quantized.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
@@ -117,7 +159,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_nchw(
+__kernel void dwc_3x3_native_qasymm8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -151,10 +193,10 @@
int8 values0 = 0;
int8 sum0 = 0;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
int8 values1 = 0;
int8 sum1 = 0;
-#endif /* CONV_STRIDE_Y */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
// Row0
int8 left, middle, right;
@@ -168,44 +210,44 @@
#endif /* WEIGHTS_OFFSET != 0 */
// Row1
- GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
+ GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
values0 += left * (int8)(w1.s0);
values0 += middle * (int8)(w1.s1);
values0 += right * (int8)(w1.s2);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += left * (int8)(w0.s0);
values1 += middle * (int8)(w0.s1);
values1 += right * (int8)(w0.s2);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#if WEIGHTS_OFFSET != 0
int8 tmp = left + middle + right;
sum0 += tmp;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */
// Row2
- GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
+ GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
values0 += left * (int8)(w2.s0);
values0 += middle * (int8)(w2.s1);
values0 += right * (int8)(w2.s2);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += left * (int8)(w1.s0);
values1 += middle * (int8)(w1.s1);
values1 += right * (int8)(w1.s2);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#if WEIGHTS_OFFSET != 0
tmp = left + middle + right;
sum0 += tmp;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
sum1 += tmp;
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
#endif /* WEIGHTS_OFFSET != 0 */
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
// Row3
GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
values1 += left * (int8)(w2.s0);
@@ -215,20 +257,20 @@
#if WEIGHTS_OFFSET != 0
sum1 += left + middle + right;
#endif /* WEIGHTS_OFFSET != 0 */
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#if defined(HAS_BIAS)
values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#endif //defined(HAS_BIAS)
#if WEIGHTS_OFFSET != 0
values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
#endif /* WEIGHTS_OFFSET != 0 */
#if INPUT_OFFSET != 0
@@ -236,16 +278,16 @@
ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
#endif /* INPUT_OFFSET != 0 */
#if K_OFFSET != 0
values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#endif /* K_OFFSET != 0 */
#if defined(REAL_MULTIPLIER)
@@ -254,7 +296,7 @@
#else // defined(REAL_MULTIPLIER)
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -264,14 +306,14 @@
res0 = min(res0, (uchar8)255);
vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
#if defined(REAL_MULTIPLIER)
values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
#else // defined(REAL_MULTIPLIER)
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
#endif // defined(REAL_MULTIPLIER)
@@ -281,11 +323,11 @@
res1 = min(res1, (uchar8)255);
vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
}
#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
-
+#if DILATION_X == 1
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
({ \
@@ -317,6 +359,43 @@
right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \
})
#endif /* CONV_STRIDE_X */
+#else /*DILATION_X==1*/
+
+#if CONV_STRIDE_X == 1
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ left = vload8(0, first_value); \
+ middle = vload8(0, first_value + DILATION_X * sizeof(uchar)); \
+ right = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
+ })
+#elif CONV_STRIDE_X == 2
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ uchar16 temp0 = vload16(0, first_value); \
+ left = temp0.s02468ace; \
+ temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
+ middle = temp0.s02468ace; \
+ temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
+ right = temp0.s02468ace; \
+ })
+#else /* CONV_STRIDE_X */
+#define GET_VALUES(first_value, left, middle, right) \
+ ({ \
+ uchar16 temp0 = vload16(0, first_value); \
+ uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
+ left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
+ \
+ temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
+ temp1 = vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))); \
+ middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
+ \
+ temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
+ temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \
+ right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
+ })
+
+#endif /* CONV_STRIDE_X */
+#endif /*DILATION_X==1*/
/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
@@ -349,7 +428,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
+__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights)
@@ -389,8 +468,8 @@
int8 sum0 = 0;
GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
- GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);
- GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);
+ GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
+ GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
#if WEIGHTS_OFFSET != 0
sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
@@ -398,7 +477,7 @@
sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
#endif /* WEIGHTS_OFFSET != 0 */
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
// If conv_stride_y is equals to 1, we compute two output rows
uchar8 left3, middle3, right3;
@@ -412,7 +491,7 @@
sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
#endif /* WEIGHTS_OFFSET != 0 */
-#endif // CONV_STRIDE_Y == 1
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
@@ -446,7 +525,7 @@
ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
values0.s7 += right2.s7 * w2.s2;
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
values1.s0 += right3.s0 * w2.s2;
@@ -478,20 +557,20 @@
ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
values1.s7 += right3.s7 * w2.s2;
-#endif // CONV_STRIDE_Y == 1
+#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
#if defined(HAS_BIAS)
values0 += (int8)(bias_value);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += (int8)(bias_value);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
#endif //defined(HAS_BIAS)
#if WEIGHTS_OFFSET != 0
values0 += sum0 * (int8)(WEIGHTS_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += sum1 * (int8)(WEIGHTS_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
#endif /* WEIGHTS_OFFSET != 0 */
#if INPUT_OFFSET != 0
@@ -499,16 +578,16 @@
ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
values0 += sum_weights * (int8)(INPUT_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += sum_weights * (int8)(INPUT_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#endif /* INPUT_OFFSET != 0 */
#if K_OFFSET != 0
values0 += (int8)(K_OFFSET);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += (int8)(K_OFFSET);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
#endif /* K_OFFSET != 0 */
#if defined(REAL_MULTIPLIER)
@@ -527,7 +606,7 @@
res0 = min(res0, (uchar8)255);
vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
-#if CONV_STRIDE_Y == 1
+#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
#if defined(REAL_MULTIPLIER)
@@ -545,7 +624,7 @@
res1 = min(res1, (uchar8)255);
vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
-#endif /* CONV_STRIDE_Y == 1 */
+#endif /* CONV_STRIDE_Y == 1 && DILATION_Y == 1 */
}
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
@@ -597,9 +676,10 @@
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -640,7 +720,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc(
+__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -654,9 +734,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -668,7 +748,7 @@
int z_coord = 0;
int4 offset = 0;
- int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
+ int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
@@ -682,15 +762,19 @@
VEC_INT acc = 0, sum = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
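The weight loads above rely on the layout produced by CLDepthwiseConvolutionLayerReshapeWeightsKernel: with VEC_SIZE == 4, the nine 3x3 taps of the four channels handled by one work-item sit in a single contiguous 36-byte row (tap 0 for the four channels, then tap 1, and so on), so two 16-byte loads plus one 4-byte load replace the nine per-tap loads. A sketch of the indexing (illustrative only):

/* Byte (tap * 4 + c) of a reshaped weights row is tap 'tap' (0..8) of channel 'c' (0..3)
 * within the group processed by this work-item. */
static inline unsigned char reshaped_weight(const unsigned char *row, int tap, int c)
{
    return row[tap * 4 + c];
}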
@@ -715,16 +799,16 @@
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
+ z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
// z == 2
- // After z = 1 we can simply add src_stride_z to offset without updating z_coord
- // However offset can be out-of-bound so we need to check if it is greater than max_offset
- offset += (int4)src_stride_z;
+ // Offset can be out-of-bound so we need to check if it is greater than max_offset
+ z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+ offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -766,7 +850,7 @@
#else // defined(REAL_MULTIPLIER)
- acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc += (VEC_INT)OUTPUT_OFFSET;
@@ -785,9 +869,10 @@
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
*
+ * @note This kernel assumes VEC_SIZE is 4.
* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -829,7 +914,7 @@
* @param[in] max_offset Max offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -843,9 +928,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -874,15 +959,19 @@
VEC_INT acc3 = 0, sum3 = 0;
// Load weights
- VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
- VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
- VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
- VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
- VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
- VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
- VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
- VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
- VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+ uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+ uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+ uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
+
+ uchar4 w0 = w0_tmp.s0123;
+ uchar4 w1 = w0_tmp.s4567;
+ uchar4 w2 = w0_tmp.s89AB;
+ uchar4 w3 = w0_tmp.sCDEF;
+
+ uchar4 w4 = w1_tmp.s0123;
+ uchar4 w5 = w1_tmp.s4567;
+ uchar4 w6 = w1_tmp.s89AB;
+ uchar4 w7 = w1_tmp.sCDEF;
#if INPUT_OFFSET != 0
VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1020,10 +1109,10 @@
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
@@ -1110,7 +1199,7 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
* @param[in] max_offset The maximum allowed offset for the input tensor
*/
-__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
TENSOR4D_DECLARATION(src),
TENSOR4D_DECLARATION(dst),
IMAGE_DECLARATION(weights),
@@ -1124,9 +1213,9 @@
#if defined(DST_DEPTH)
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else // defined(DST_DEPTH)
+ int z = get_global_id(2); // spatial coordinate y
+#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1255,8 +1344,8 @@
#else // defined(REAL_MULTIPLIER)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
#endif // defined(REAL_MULTIPLIER)
acc0 += (VEC_INT)OUTPUT_OFFSET;
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index 4908bb0..7307700 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,51 +23,68 @@
*/
#include "helpers.h"
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+
/** This performs the dequantization of 8-bit unsigned integers to floating point.
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Suppported data types: F32.
- * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
- * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ * @note The data type must be passed as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=float
+ * @note The vector size must be passed as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The quantization scale of the input tensor is passed in with -DSCALE=scale.
+ * @note The quantization offset of the input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void dequantization_layer(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- VECTOR_DECLARATION(min_max))
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Vector min_max = CONVERT_TO_VECTOR_STRUCT(min_max);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- // min_max_value.s0 = min, min_max_value.s1 = max
- const float2 min_max_value = vload2(0, (__global float *)min_max.ptr);
-
- const float4 vmin = (float4)min_max_value.s0;
- const float4 scale = (float4)((min_max_value.s1 - min_max_value.s0) / 255.0f);
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if the access on width gets out of bounds
+    // If it does, shift the access vector so the accessed elements stay within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
// Load data
- const uchar4 data = vload4(0, (__global uchar *)input.ptr);
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = SCALE;
+
+ const VEC_DATA_TYPE(int, VEC_SIZE)
+ voffset = OFFSET;
// Dequantize
- const float4 res = convert_float4(data) * scale + vmin;
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
// Store result
- vstore4(res, 0, (__global float *)output.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(output.ptr)) = (DATA_TYPE)((float)((int)(*((__global uchar *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
}
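+
+// Example build options (illustrative values only): for a 128-element row processed
+// 16 elements per work-item, one might pass
+//   -DVEC_SIZE=16 -DDATA_TYPE=float -DSCALE=0.5f -DOFFSET=10 -DLAST_ACCESSED_X=112
+// With those values a quantized input of 12 dequantizes to (12 - 10) * 0.5f = 1.0f,
+// which is what both the vectorized path and the scalar fallback above compute.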
+
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl
new file mode 100644
index 0000000..0027fd5
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft.cl
@@ -0,0 +1,1771 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in] phi The angle.
+ * @param[in,out] input The input on which the factor should be applied.
+ */
+#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \
+ { \
+ float2 w, tmp; \
+ w.x = native_cos(phi); \
+ w.y = native_sin(phi); \
+ tmp.x = (w.x * input.x) - (w.y * input.y); \
+ tmp.y = (w.x * input.y) + (w.y * input.x); \
+ input = tmp; \
+ }
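+
+// Editorial illustration: 'twiddle_mul_example' is not part of this file; it just spells
+// out the complex product input * e^(i*phi) that the macro above performs in place.
+inline float2 twiddle_mul_example(float phi, float2 c)
+{
+    const float2 w = (float2)(native_cos(phi), native_sin(phi));
+    return (float2)(w.x * c.x - w.y * c.y, w.x * c.y + w.y * c.x);
+}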
+
+/** Computes radix-2 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ */
+#define DFT_2(c0, c1) \
+ { \
+ float2 v0; \
+ v0 = c0; \
+ c0 = v0 + c1; \
+ c1 = v0 - c1; \
+ }
+
+// radix-3 butterfly unit factors
+#define SQRT3DIV2 0.86602540378443f
+
+/** Computes radix-3 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ */
+#define DFT_3(c0, c1, c2) \
+ { \
+ float2 v0 = c1 + c2; \
+ float2 v1 = c1 - c2; \
+ c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+ c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+ c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+ c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+ c0 = c0 + v0; \
+ }
+
+/** Computes radix-4 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ */
+#define DFT_4(c0, c1, c2, c3) \
+ { \
+ float2 v0, v1, v2, v3; \
+ v0 = c0 + c2; \
+ v1 = c1 + c3; \
+ v2 = c0 - c2; \
+ v3.x = c1.y - c3.y; \
+ v3.y = c3.x - c1.x; \
+ c0 = v0 + v1; \
+ c2 = v0 - v1; \
+ c1 = v2 + v3; \
+ c3 = v2 - v3; \
+ }
+
+// radix-5 butterfly unit factors
+#define W5_A 0.30901699437494f
+#define W5_B 0.95105651629515f
+#define W5_C 0.80901699437494f
+#define W5_D 0.58778525229247f
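+// (W5_A, W5_B) = (cos(2*pi/5), sin(2*pi/5)) and (W5_C, W5_D) = (cos(pi/5), sin(pi/5))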
+
+/** Computes radix-5 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ */
+#define DFT_5(c0, c1, c2, c3, c4) \
+ { \
+ float2 v0, v1, v2, v3, v4; \
+ v0 = c0; \
+ v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \
+ v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \
+ v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \
+ v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \
+ c0 = v0 + c1 + c2 + c3 + c4; \
+ c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
+ c2 = v0 - v2 + (float2)(v3.y, -v3.x); \
+ c3 = v0 - v2 + (float2)(-v3.y, v3.x); \
+ c4 = v0 + v1 + (float2)(-v4.y, v4.x); \
+ }
+
+// radix-7 butterfly unit factors
+#define W7_A 0.62348980185873f
+#define W7_B 0.78183148246802f
+#define W7_C 0.22252093395631f
+#define W7_D 0.97492791218182f
+#define W7_E 0.90096886790241f
+#define W7_F 0.43388373911755f
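+// (W7_A, W7_B) = (cos(2*pi/7), sin(2*pi/7)), (W7_C, W7_D) = (cos(3*pi/7), sin(3*pi/7)),
+// (W7_E, W7_F) = (cos(pi/7), sin(pi/7))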
+
+/** Computes radix-7 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ */
+#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
+ { \
+ float2 v0, v1, v2, v3, v4, v5, v6; \
+ v0 = c0; \
+ v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
+ v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
+ v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
+ v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
+ v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
+ v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
+ c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
+ c1 = v0 + v1 + (float2)(v4.y, -v4.x); \
+ c2 = v0 - v2 + (float2)(v5.y, -v5.x); \
+ c3 = v0 - v3 + (float2)(v6.y, -v6.x); \
+ c4 = v0 - v3 + (float2)(-v6.y, v6.x); \
+ c5 = v0 - v2 + (float2)(-v5.y, v5.x); \
+ c6 = v0 + v1 + (float2)(-v4.y, v4.x); \
+ }
+
+/** Computes radix-8 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ * @param[in,out] c7 Complex input 7.
+ */
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
+ { \
+ float2 v0, v1, v2, v3, v4, v5, v6, v7; \
+ float2 s0, s1, s2, s3, s4, s5, s6, s7; \
+ float2 t0, t1, t2; \
+ v0 = c0 + c4; \
+ v1 = c1 + c5; \
+ v2 = c2 + c6; \
+ v3 = c3 + c7; \
+ v4 = c0 - c4; \
+ v5 = c1 - c5; \
+ v6 = c2 - c6; \
+ v7 = c3 - c7; \
+ s0 = v0 + v2; \
+ s1 = v1 + v3; \
+ s2 = v0 - v2; \
+ s3 = v1 - v3; \
+ s4.x = v4.x - v6.y; \
+ s4.y = v4.y + v6.x; \
+ s5.x = v5.x - v7.y; \
+ s5.y = v5.y + v7.x; \
+ s6.x = v4.x + v6.y; \
+ s6.y = v4.y - v6.x; \
+ s7.x = v5.x + v7.y; \
+ s7.y = v5.y - v7.x; \
+ t0.x = -s3.y; \
+ t0.y = s3.x; \
+ t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
+ t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
+ t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
+ t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
+ c0 = s0 + s1; \
+ c1 = s6 - t2; \
+ c2 = s2 - t0; \
+ c3 = s4 - t1; \
+ c4 = s0 - s1; \
+ c5 = s6 + t2; \
+ c6 = s2 + t0; \
+ c7 = s4 + t1; \
+ }
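+
+// Note: M_SQRT1_2_F is the OpenCL built-in single-precision constant for 1/sqrt(2)
+// (= cos(pi/4) = sin(pi/4)), used by the radix-8 butterfly above.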
+
+/** Computes the first stage of a radix-2 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ float4 data = vload4(0, (__global float *)input.ptr);
+
+ // Compute DFT N = 2
+ DFT_2(data.s01, data.s23);
+
+ // Store two complex output values
+ vstore4(data, 0, (__global float *)output.ptr);
+}
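+
+// Editorial note: as the loads above show, complex values are stored as interleaved
+// (real, imaginary) float pairs, so each work-item of a first-stage kernel handles one
+// whole radix-N group along the FFT axis (here a float4, i.e. two complex values).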
+
+/** Computes the first stage of a radix-2 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ float2 data1 = vload2(0, (__global float *)input.ptr);
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Compute DFT N = 2
+ DFT_2(data1, data2);
+
+ // Store two complex output values
+ vstore2(data1, 0, (__global float *)output.ptr);
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ float4 data0 = vload4(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
+
+ // Compute DFT N = 3
+ DFT_3(data0.s01, data0.s23, data1.s01);
+
+ // Store three complex output values
+ vstore4(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ float2 data0 = vload2(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+
+ // Compute DFT N = 3
+ DFT_3(data0, data1, data2);
+
+ // Store three complex output values
+ vstore2(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ float8 data = vload8(0, (__global float *)input.ptr);
+
+ // Compute DFT N = 4
+ DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+ // Store four complex output values
+ vstore8(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ float2 data0 = vload2(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+ float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+
+ // Compute DFT N = 4
+ DFT_4(data0, data1, data2, data3);
+
+ // Store four complex output values
+ vstore2(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ float8 data0 = vload8(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+
+ // Compute DFT N = 5
+ DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+ // Store five complex output values
+ vstore8(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ float2 data0 = vload2(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+ float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+ float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+
+ // Compute DFT N = 5
+ DFT_5(data0, data1, data2, data3, data4);
+
+ // Store five complex output values
+ vstore2(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ float8 data0 = vload8(0, (__global float *)input.ptr);
+ float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
+
+ // Compute DFT N = 7
+ DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+ // Store seven complex output values
+ vstore8(data0, 0, (__global float *)output.ptr);
+ vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ float2 data0 = vload2(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+ float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+ float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+ float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+ float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+
+ // Compute DFT N = 7
+ DFT_7(data0, data1, data2, data3, data4, data5, data6);
+
+ // Store seven complex output values
+ vstore2(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ float16 data = vload16(0, (__global float *)input.ptr);
+
+ // Compute DFT N = 8
+ DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
+
+ // Store eight complex output values
+ vstore16(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+kernel void fft_radix_8_first_stage_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ float2 data0 = vload2(0, (__global float *)input.ptr);
+ float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+ float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2, 0));
+ float2 data3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3, 0));
+ float2 data4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4, 0));
+ float2 data5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5, 0));
+ float2 data6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6, 0));
+ float2 data7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7, 0));
+
+ // Compute DFT N = 8
+ DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
+
+ // Store eight complex output values
+ vstore2(data0, 0, (__global float *)output.ptr);
+ vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 0, 1, 0));
+ vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 0, 2, 0));
+ vstore2(data3, 0, (__global float *)tensor3D_offset(&output, 0, 3, 0));
+ vstore2(data4, 0, (__global float *)tensor3D_offset(&output, 0, 4, 0));
+ vstore2(data5, 0, (__global float *)tensor3D_offset(&output, 0, 5, 0));
+ vstore2(data6, 0, (__global float *)tensor3D_offset(&output, 0, 6, 0));
+ vstore2(data7, 0, (__global float *)tensor3D_offset(&output, 0, 7, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span. Product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_2_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-2
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+ // Compute DFT N = 2
+ DFT_2(c0, c1);
+
+ // Store two complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+}
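+
+// Worked example (editorial illustration): for a radix-2 stage with Nx = 2 and Ni = 4,
+// work-item kx = 3 gives nx = 3 % 2 = 1 and n = 1 + (3 / 2) * 4 = 5, so the butterfly
+// combines the elements at positions n = 5 and n + Nx = 7 (in units of the x stride),
+// after rotating the second one by the twiddle factor with phi = nx * exp_const.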
+
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span. Product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_2_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-2
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load two complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+ // Compute DFT N = 2
+ DFT_2(c0, c1);
+
+ // Store two complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span. Product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_3_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-3
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+ // Compute DFT N = 3
+ DFT_3(c0, c1, c2);
+
+ // Store three complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span. Product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_3_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-3
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load three complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+ // Compute DFT N = 3
+ DFT_3(c0, c1, c2);
+
+ // Store three complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span. Product of the radix orders of all previous stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_4_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-4
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+ // Compute DFT N = 4
+ DFT_4(c0, c1, c2, c3);
+
+ // Store four complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
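
The DFT_4 macro used above is defined earlier in this file and is not part of this hunk. For
reference, a length-4 DFT with the forward convention w = exp(-2*pi*i/4) = -i reduces to the
plain-C butterfly below; this is an illustration of the mathematics and is only assumed to match
the macro's sign convention.

typedef struct { float re, im; } cfloat;

static void dft4_ref(cfloat *c0, cfloat *c1, cfloat *c2, cfloat *c3)
{
    cfloat t0 = { c0->re + c2->re, c0->im + c2->im }; /* even sum  */
    cfloat t1 = { c0->re - c2->re, c0->im - c2->im }; /* even diff */
    cfloat t2 = { c1->re + c3->re, c1->im + c3->im }; /* odd sum   */
    cfloat t3 = { c1->re - c3->re, c1->im - c3->im }; /* odd diff  */

    /* Multiplying by -i swaps components: -i * (a + ib) = b - ia */
    *c0 = (cfloat){ t0.re + t2.re, t0.im + t2.im }; /* X0 = t0 + t2     */
    *c1 = (cfloat){ t1.re + t3.im, t1.im - t3.re }; /* X1 = t1 - i * t3 */
    *c2 = (cfloat){ t0.re - t2.re, t0.im - t2.im }; /* X2 = t0 - t2     */
    *c3 = (cfloat){ t1.re - t3.im, t1.im + t3.re }; /* X3 = t1 + i * t3 */
}
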
+
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_4_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-4
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load four complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+ // Compute DFT N = 4
+ DFT_4(c0, c1, c2, c3);
+
+ // Store four complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_5_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-5
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+ // Compute DFT N = 5
+ DFT_5(c0, c1, c2, c3, c4);
+
+ // Store five complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_5_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-5
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load five complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+ // Compute DFT N = 5
+ DFT_5(c0, c1, c2, c3, c4);
+
+ // Store five complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_7_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-7
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+ // Compute DFT N = 7
+ DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+ // Store seven complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_7_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-7
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load seven complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+ // Compute DFT N = 7
+ DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+ // Store seven complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_8_axis_0(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-8
+ uint kx = get_global_id(0);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+ float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+ float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+ float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+ TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+ // Compute DFT N = 8
+ DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+ // Store eight complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+ vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+ vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+ vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]  Nx                                   The butterfly span. Product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny.
+ * @param[in] exp_const Exponent constant
+ */
+kernel void fft_radix_8_axis_1(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+ ,
+ uint Nx, uint Ni, float exp_const)
+{
+ // Each work-item computes a single radix-8
+ uint kx = get_global_id(1);
+
+ // Compute nx
+ uint nx = kx % Nx;
+
+ // Compute n index
+ uint n = nx + (kx / Nx) * Ni;
+
+ // Get tensor pointers
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+ output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+ // Load eight complex input values
+ float2 c0 = vload2(0, (__global float *)input.ptr);
+ float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, 0, Nx, 0));
+ float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+ float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+ float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+ float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+ float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+ float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+ // Compute phi
+ float phi = (float)nx * exp_const;
+
+ // Multiply by twiddle factor
+ TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+ TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+ TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+ TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+ TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+ TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+ TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+ // Compute DFT N = 8
+ DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+ // Store eight complex output values
+ vstore2(c0, 0, (__global float *)output.ptr);
+ vstore2(c1, 0, (__global float *)tensor3D_offset(&output, 0, Nx, 0));
+ vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+ vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+ vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+ vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+ vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+ vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/fft_digit_reverse.cl
new file mode 100644
index 0000000..040c284
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft_digit_reverse.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(idx))
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+ const unsigned int iidx = *((__global uint *)(idx.ptr));
+
+ // Load data
+#if VEC_SIZE == 1
+ float data = *((__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+ float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "Only VEC_SIZE values of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Create result
+#if VEC_SIZE == 1
+ float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+ float2 res = data;
+#else // VEC_SIZE == 1
+#error "Only VEC_SIZE values of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Store result
+#if defined(CONJ)
+ vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
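
The digit-reverse stage is a gather: output element x is read from input element idx[x], a real
(VEC_SIZE=1) input is widened to an interleaved (re, im) pair with a zero imaginary part, and
-DCONJ negates the imaginary part. The index tensor is precomputed outside this kernel; the
bit_reverse() helper below covers only the pure radix-2 case and, like the rest of this host-side
sketch, is an illustrative assumption rather than the library's implementation.

#include <stdint.h>

/* Bit-reverse the low log2_n bits of x (pure radix-2 case only). */
static uint32_t bit_reverse(uint32_t x, unsigned log2_n)
{
    uint32_t r = 0;
    for(unsigned b = 0; b < log2_n; ++b)
    {
        r = (r << 1) | ((x >> b) & 1u);
    }
    return r;
}

/* Gather an interleaved complex buffer through idx[], optionally conjugating. */
static void digit_reverse_complex(const float *src, float *dst, const uint32_t *idx,
                                  unsigned n, int conj)
{
    for(unsigned x = 0; x < n; ++x)
    {
        dst[2 * x + 0] = src[2 * idx[x] + 0];
        dst[2 * x + 1] = conj ? -src[2 * idx[x] + 1] : src[2 * idx[x] + 1];
    }
}
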
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(idx))
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+ const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+ // Load data
+#if VEC_SIZE == 1
+ float data = *((__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+ float2 data = vload2(0, (__global float *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "Only VEC_SIZE values of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Create result
+#if VEC_SIZE == 1
+ float2 res = { data, 0 };
+#elif VEC_SIZE == 2
+ float2 res = data;
+#else // VEC_SIZE == 1
+#error "Only VEC_SIZE values of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+ // Store result
+#if defined(CONJ)
+ vstore2((float2)(res.s0, -res.s1), 0, (__global float *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(res, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/fft_scale.cl
new file mode 100644
index 0000000..bf78a26
--- /dev/null
+++ b/src/core/CL/cl_kernels/fft_scale.cl
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Computes the fft scale stage
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] scale Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+ TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+ ,
+ float scale)
+{
+ // Get tensor pointers
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+ Tensor3D dst = src;
+#else /* IN_PLACE */
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+ // Store result
+#if VEC_SIZE == 1
+ *((__global float *)dst.ptr) = (*(__global float *)src.ptr) / scale;
+#elif VEC_SIZE == 2
+ // Load data
+ float2 data = vload2(0, (__global float *)src.ptr);
+ data /= scale;
+#if defined(CONJ)
+ vstore2((float2)(data.s0, -data.s1), 0, (__global float *)dst.ptr);
+#else // defined(CONJ)
+ vstore2(data, 0, (__global float *)dst.ptr);
+#endif // defined(CONJ)
+#else // VEC_SIZE == 1
+#error "Only VEC_SIZE values of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
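
The scale stage therefore just divides every complex value by scale (for an inverse transform this
is typically the transform length) and, when -DCONJ is defined, flips the sign of the imaginary
part. A minimal host-side equivalent for interleaved (re, im) data, as an illustrative sketch:

static void fft_scale_conj_ref(float *data, unsigned n, float scale, int conj)
{
    for(unsigned x = 0; x < n; ++x)
    {
        data[2 * x + 0] = data[2 * x + 0] / scale;
        data[2 * x + 1] = (conj ? -data[2 * x + 1] : data[2 * x + 1]) / scale;
    }
}
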
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 4736f80..da94008 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1128,7 +1128,1149 @@
#endif // defined(TRANSPOSE)
#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
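
The ARM_DOTn macros above chain fma operations to accumulate a length-n dot product, and
ARM_DOT_K0XN0 applies one such chain per reshaped RHS column so that each lane of the accumulator
c collects dot(a, b_j). In plain C, the K0 = 4, N0 = 2 instantiation behaves like the sketch below
(an illustrative expansion with assumed names, not the macro text):

#include <math.h>

static void arm_dot_4x2_ref(const float a[4], const float b0[4], const float b1[4], float c[2])
{
    for(int k = 0; k < 4; ++k)
    {
        c[0] = fmaf(a[k], b0[k], c[0]); /* ARM_DOT4(a, b0, c.s0) */
        c[1] = fmaf(a[k], b1[k], c[1]); /* ARM_DOT4(a, b1, c.s1) */
    }
}
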
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (i.e. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                            lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                            lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                            rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                            rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+    // The plane (zin) is calculated by dividing the row index (y * M0) by HEIGHT_GEMM3D
+ zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
+#if M0 > 1
+ zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+ zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+ zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+ zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+ zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+ zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+ zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
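
To make the cross-plane arithmetic above concrete, here is an illustrative worked example (the
numbers are assumptions, not values required by the kernel):

    // Example: M0 = 4, y = 1, HEIGHT_GEMM3D = 6, DEPTH_GEMM3D = 4
    // Rows covered by this tile: 4, 5, 6, 7
    // zin0 = zin1 = 4 / 6 = 5 / 6 = 0 -> rows 4 and 5 need no extra offset
    // zin2 = zin3 = 6 / 6 = 7 / 6 = 1 -> rows 6 and 7 skip one lhs_cross_plane_pad * lhs_stride_y
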
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ // Load values from RHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ DATA_TYPE a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ DATA_TYPE a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ DATA_TYPE a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ DATA_TYPE a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ // Load values from RHS matrix
+ DATA_TYPE b0 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE b1 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#if N0 > 2
+ DATA_TYPE b2 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 2
+#if N0 > 3
+ DATA_TYPE b3 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 3
+#if N0 > 4
+ DATA_TYPE b4 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE b5 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE b6 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE b7 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 4
+#if N0 > 8
+ DATA_TYPE b8 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE b9 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bA = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bB = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bC = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bD = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bE = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ DATA_TYPE bF = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+#endif // N0 > 8
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+ zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+ zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+ zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+ zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+ zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+ zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+ zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+ zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+ zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+ c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+ c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+ c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+ c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+ c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+ c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+ c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+ // Store output block
+ VSTORE(N0)
+ (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+ VSTORE(N0)
+ (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(N0)
+ (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(N0)
+ (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(N0)
+ (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(N0)
+ (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(N0)
+ (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(N0)
+ (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
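+
+// Illustrative sketch only (not generated code): assuming a hypothetical configuration with
+// DATA_TYPE=float, M0=2 and N0=4, the call LD_RHS_VFMA_M0xN0(0, a, c) expands to roughly:
+//
+//   float4 b = vload4(0, (__global float *)(rhs_ptr + rhs_offset + 0x0 * RHS_STEP_X * sizeof(float)));
+//   c0 = fma((float4)(a0.s0), b, c0); // lane 0 of the first LHS row times one RHS row of N0 values
+//   c1 = fma((float4)(a1.s0), b, c1); // lane 0 of the second LHS row times the same RHS row
+//
+// i.e. each invocation loads one row of N0 RHS values and fuse-multiply-accumulates it against
+// lane i of every loaded LHS row.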
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (i.e. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
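+// Illustrative build options only (hypothetical values; in practice they are assembled by the host-side
+// kernel configuration): a possible set of compile-time definitions for this kernel could be
+//   -DDATA_TYPE=float -DM=52 -DN=30 -DK=90 -DM0=4 -DN0=4 -DK0=4 -DH0=2 -DRHS_INTERLEAVE
+// with -DREINTERPRET_INPUT_AS_3D / -DREINTERPRET_OUTPUT_AS_3D, -DHEIGHT_GEMM3D, -DDEPTH_GEMM3D and
+// -DDUMMY_WORK_ITEMS added only when the corresponding features are needed.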
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
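+ // Illustrative values (hypothetical N0=4, K0=4, H0=2): with RHS_INTERLEAVE the rows of the H0
+ // horizontal K0xN0 blocks are interleaved, so RHS_OFFSET_X = N0 = 4 selects the block and consecutive
+ // rows of the same block are RHS_STEP_X = N0 * H0 = 8 elements apart; without RHS_INTERLEAVE the
+ // blocks are stored back to back, giving RHS_OFFSET_X = K0 * N0 = 16 and RHS_STEP_X = N0 = 4.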
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
+#if M0 > 1
+ zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+ zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+ zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+ zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+ zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+ zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+ zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+ LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+ LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+ LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+ LD_RHS_VFMA_M0xN0(4, a, c);
+ LD_RHS_VFMA_M0xN0(5, a, c);
+ LD_RHS_VFMA_M0xN0(6, a, c);
+ LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+ LD_RHS_VFMA_M0xN0(8, a, c);
+ LD_RHS_VFMA_M0xN0(9, a, c);
+ LD_RHS_VFMA_M0xN0(A, a, c);
+ LD_RHS_VFMA_M0xN0(B, a, c);
+ LD_RHS_VFMA_M0xN0(C, a, c);
+ LD_RHS_VFMA_M0xN0(D, a, c);
+ LD_RHS_VFMA_M0xN0(E, a, c);
+ LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
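+ // Note: a 2-element vector is declared even though a single value is read; only lane .s0 is consumed
+ // by LD_RHS_VFMA_M0xN0 below, but a vector type is required so that the .s##i accessor compiles in
+ // these left-over iterations.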
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+ zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+ zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+ zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+ zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+ zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+ zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+ zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+ zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+ zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ c0 = c0 * (DATA_TYPE)ALPHA;
+#if M0 > 1
+ c1 = c1 * (DATA_TYPE)ALPHA;
+#endif // M0 > 1
+#if M0 > 2
+ c2 = c2 * (DATA_TYPE)ALPHA;
+#endif // M0 > 2
+#if M0 > 3
+ c3 = c3 * (DATA_TYPE)ALPHA;
+#endif // M0 > 3
+#if M0 > 4
+ c4 = c4 * (DATA_TYPE)ALPHA;
+#endif // M0 > 4
+#if M0 > 5
+ c5 = c5 * (DATA_TYPE)ALPHA;
+#endif // M0 > 5
+#if M0 > 6
+ c6 = c6 * (DATA_TYPE)ALPHA;
+#endif // M0 > 6
+#if M0 > 7
+ c7 = c7 * (DATA_TYPE)ALPHA;
+#endif // M0 > 7
+#endif // defined(ALPHA)
+
+ // Store output block
+ VSTORE(N0)
+ (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+ VSTORE(N0)
+ (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(N0)
+ (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(N0)
+ (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(N0)
+ (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(N0)
+ (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(N0)
+ (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(N0)
+ (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N)
#if K0 == 2
#define ARM_DOT_K0(a, b, c) \
@@ -1248,15 +2390,19 @@
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
* @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
* - N0 = 2, 3, 4, 8, 16
* - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
*
* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
@@ -1328,6 +2474,13 @@
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
(get_global_id(2) * lhs_stride_z);
@@ -1348,11 +2501,14 @@
for(int i = 0; i < k; i += K0)
{
// Supported cases (M0, K0):
- // 2,4 - 2,8 - 2,16
- // 3,4 - 3,8 - 3,16
- // 4,4 - 4,8 - 4,16
- // 5,4 - 5,8 - 5,16
- // 6,4 - 6,8 - 6,16
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
// Load values from LHS matrix
VEC_DATA_TYPE(DATA_TYPE, K0)
a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 0 * LHS_STEP_X * sizeof(DATA_TYPE)));
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 277338b..033b4b4 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1944,7 +1944,7 @@
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
@@ -2099,10 +2099,12 @@
#error "N0 value not supported"
#endif // N0 conditions
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8 data type.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
@@ -2112,6 +2114,8 @@
* - M0 = 2, 3, 4, 5, 6, 7, 8
* - N0 = 2, 3, 4, 8, 16
* - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
*
* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
@@ -2183,6 +2187,13 @@
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X + (get_global_id(1) / V0) * (uint)lhs_stride_y + (get_global_id(
2)
@@ -2423,7 +2434,7 @@
}
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices unsing the dot8 instruction.
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8 data type using the dot8 instruction.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
*
@@ -2512,6 +2523,556 @@
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ ARM_DOT((uchar4)(a, (uchar3)0), (uchar4)(b, (uchar3)0), c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT(a, b, c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ c += (uint)a.s0 * b.s0; \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT1(a, b, c); \
+ c += (uint)a.s1 * b.s1; \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c += (uint)a.s2 * b.s2; \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c += (uint)a.s3 * b.s3; \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
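+
+// Illustrative note: both branches accumulate the same K0-wide dot product. On the dot8 path,
+// ARM_DOT1/2/3 zero-pad their operands to uchar4 so the 4-lane hardware dot product still returns the
+// 1/2/3-element result; for a hypothetical K0=2,
+//   ARM_DOT2(a, b, c) behaves like ARM_DOT((uchar4)(a.s0, a.s1, 0, 0), (uchar4)(b.s0, b.s1, 0, 0), c)
+// while the fallback path accumulates c += (uint)a.s0 * b.s0 + (uint)a.s1 * b.s1 explicitly.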
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
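+// Illustrative build options only (hypothetical values; in practice they are assembled by the host-side
+// kernel configuration): a possible set of compile-time definitions for this kernel could be
+//   -DM=64 -DN=64 -DK=64 -DM0=4 -DN0=4 -DK0=16 -DH0=2 -DRHS_INTERLEAVE
+// adding -DARM_COMPUTE_OPENCL_DOT8_ENABLED to select the dot8 accumulation path on devices exposing
+// cl_arm_integer_dot_product_int8, and -DDUMMY_WORK_ITEMS when the NDRange has been padded with dummy
+// work-items.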
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
+#if M0 > 1
+ zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
+ zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
+ zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
+ zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
+ zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
+ zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
+ zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
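+ // Note: the accumulators are kept as unsigned 32-bit integers and are saturating-converted to signed
+ // 32-bit values (CONVERT_SAT) when the output block is stored.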
+
+ for(int i = 0; i < K; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(uchar, K0)
+ a0 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0);
+#if M0 > 1
+ VEC_DATA_TYPE(uchar, K0)
+ a1 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1);
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(uchar, K0)
+ a2 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2);
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(uchar, K0)
+ a3 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3);
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(uchar, K0)
+ a4 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4);
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(uchar, K0)
+ a5 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5);
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(uchar, K0)
+ a6 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6);
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(uchar, K0)
+ a7 = VLOAD(K0)(0, lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7);
+#endif // M0 > 7
+
+ // Load values from RHS matrix
+ VEC_DATA_TYPE(uchar, K0)
+ b0 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 0 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b1 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 1 * RHS_STEP_X);
+#if N0 > 2
+ VEC_DATA_TYPE(uchar, K0)
+ b2 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 2 * RHS_STEP_X);
+#endif // N0 > 2
+#if N0 > 3
+ VEC_DATA_TYPE(uchar, K0)
+ b3 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 3 * RHS_STEP_X);
+#endif // N0 > 3
+#if N0 > 4
+ VEC_DATA_TYPE(uchar, K0)
+ b4 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 4 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b5 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 5 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b6 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 6 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b7 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 7 * RHS_STEP_X);
+#endif // N0 > 4
+#if N0 > 8
+ VEC_DATA_TYPE(uchar, K0)
+ b8 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 8 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ b9 = VLOAD(K0)(0, rhs_ptr + rhs_offset + 9 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bA = VLOAD(K0)(0, rhs_ptr + rhs_offset + 10 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bB = VLOAD(K0)(0, rhs_ptr + rhs_offset + 11 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bC = VLOAD(K0)(0, rhs_ptr + rhs_offset + 12 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bD = VLOAD(K0)(0, rhs_ptr + rhs_offset + 13 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bE = VLOAD(K0)(0, rhs_ptr + rhs_offset + 14 * RHS_STEP_X);
+ VEC_DATA_TYPE(uchar, K0)
+ bF = VLOAD(K0)(0, rhs_ptr + rhs_offset + 15 * RHS_STEP_X);
+#endif // N0 > 8
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
+ zout0 *= (dst_cross_plane_pad * dst_stride_y);
+#if M0 > 1
+ zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
+ zout1 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 1
+#if M0 > 2
+ zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
+ zout2 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 2
+#if M0 > 3
+ zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
+ zout3 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 3
+#if M0 > 4
+ zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
+ zout4 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 4
+#if M0 > 5
+ zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
+ zout5 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 5
+#if M0 > 6
+ zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
+ zout6 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 6
+#if M0 > 7
+ zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
+ zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
+ zout7 *= (dst_cross_plane_pad * dst_stride_y);
+#endif // M0 > 7
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Store output block
+ VSTORE(N0)
+ (CONVERT_SAT(c0, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout0));
+#if M0 > 1
+ VSTORE(N0)
+ (CONVERT_SAT(c1, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout1));
+#endif // M0 > 1
+#if M0 > 2
+ VSTORE(N0)
+ (CONVERT_SAT(c2, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout2));
+#endif // M0 > 2
+#if M0 > 3
+ VSTORE(N0)
+ (CONVERT_SAT(c3, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout3));
+#endif // M0 > 3
+#if M0 > 4
+ VSTORE(N0)
+ (CONVERT_SAT(c4, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 4 * dst_stride_y + zout4));
+#endif // M0 > 4
+#if M0 > 5
+ VSTORE(N0)
+ (CONVERT_SAT(c5, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 5 * dst_stride_y + zout5));
+#endif // M0 > 5
+#if M0 > 6
+ VSTORE(N0)
+ (CONVERT_SAT(c6, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 6 * dst_stride_y + zout6));
+#endif // M0 > 6
+#if M0 > 7
+ VSTORE(N0)
+ (CONVERT_SAT(c7, VEC_DATA_TYPE(int, N0)), 0, (__global int *)(dst_addr + 7 * dst_stride_y + zout7));
+#endif // M0 > 7
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
*
@@ -3445,4 +4006,4 @@
// Store the result
vstore4(res, 0, dst_addr);
}
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index 5f66efb..70b8b36 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,26 +31,32 @@
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along X processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] epsilon Epsilon value
*/
__kernel void l2_normalize_x(
- VECTOR_DECLARATION(src),
- VECTOR_DECLARATION(sum),
- VECTOR_DECLARATION(dst),
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst),
DATA_TYPE epsilon)
{
- Vector src = CONVERT_TO_VECTOR_STRUCT(src);
- Vector sum = CONVERT_TO_VECTOR_STRUCT(sum);
- Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
VEC_DATA_TYPE(DATA_TYPE, 16)
in = vload16(0, (__global DATA_TYPE *)src.ptr);
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
index 80b34eb..7d8e0ef 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,24 +41,27 @@
* @param[in] value The value used to fill the pages of the tensor
*/
__kernel void memset(
- IMAGE_DECLARATION(tensor))
+ TENSOR3D_DECLARATION(tensor))
{
- Image tensor = CONVERT_TO_IMAGE_STRUCT(tensor);
+ Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
// Check if access on width gets out of bounds
// If it does shift access vector to access elements within bounds
const int xi = (int)(get_global_id(0) * VEC_SIZE);
tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
data = (DATA_TYPE)(CONSTANT_VALUE);
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)tensor.ptr);
-#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+#else // !defined(VEC_SIZE)
*((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+#endif // defined(VEC_SIZE)
}
#endif // Check for compile time constants
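
// The LAST_ACCESSED_X guard above avoids branching on the row tail: instead of masking, the last
// work-item is shifted back so that its full VEC_SIZE-wide store stays in bounds. Worked example
// (illustrative numbers, assuming LAST_ACCESSED_X is set to width - VEC_SIZE): with a row width of
// 10 and VEC_SIZE = 4, LAST_ACCESSED_X = 6; the work-item starting at xi = 8 is shifted back by
// max(8 - 6, 0) = 2 elements, so its 4-wide store covers elements 6..9 instead of overrunning the row.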
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index 9fa540e..d0e04b2 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,4 +94,52 @@
// Store result
vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
}
-#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
\ No newline at end of file
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+/** Performs a pixelwise multiplication of complex float values
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pixelwise_mul_complex(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ float2 vin1 = vload2(0, (__global float *)in1.ptr);
+ float2 vin2 = vload2(0, (__global float *)in2.ptr);
+
+ // Perform complex multiplication
+ float2 res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y };
+
+ // Store result
+ vstore2(res, 0, (__global float *)out.ptr);
+}
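+
+// The kernel above treats each element as an interleaved (real, imaginary) pair. A minimal sketch
+// of the same arithmetic for a single pair (illustrative only; this helper is not used by the kernel):
+inline float2 complex_mul_sketch(float2 a, float2 b)
+{
+    return (float2)(a.x * b.x - a.y * b.y,  // real part:      ac - bd
+                    a.x * b.y + a.y * b.x); // imaginary part: ad + bc
+}
+// For example, (1 + 2i) * (3 + 4i) = (3 - 8) + (4 + 6)i = -5 + 10i.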
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 198250b..2df22d7 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,21 @@
*/
#include "helpers.h"
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT(VEC_SIZE) \
+ VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR(VEC_SIZE) VEC_DATA_TYPE(uchar, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \
+ { \
+ const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \
+ const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \
+ res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_UCHAR(VEC_SIZE)); \
+ }
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
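+
+// A scalar sketch of what REQUANTIZE does for a single value (illustrative only; the helper below
+// is not used by the kernels in this file):
+inline uchar requantize_one_sketch(uchar in, int in_offset, int out_offset, float in_scale, float out_scale)
+{
+    const float dequantized = ((float)in - (float)in_offset) * in_scale;   // back to the real-valued domain
+    const float requantized = dequantized / out_scale + (float)out_offset; // into the output quantization
+    return convert_uchar_sat(convert_int_rte(requantized));                // round to nearest, saturate to U8
+}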
+
#if defined(POOL_AVG)
#define POOL_OP(x, y) ((x) + (y))
#else /* defined(POOL_AVG) */
@@ -118,8 +133,22 @@
res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
#endif /* defined(POOL_AVG) */
- // Store result
- *(__global uchar *)output.ptr = convert_uchar(res);
+ uchar result_u8 = convert_uchar(res);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+ const float result_f32 = convert_float(result_u8);
+ const float input_offset = (float)OFFSET_IN1;
+ const float input_scale = (float)SCALE_IN1;
+ const float scale_out = (float)SCALE_OUT;
+ const float offset_out = (float)OFFSET_OUT;
+ const float in_f32 = (result_f32 - input_offset) * input_scale;
+ const float out_f32 = in_f32 / scale_out + offset_out;
+ result_u8 = convert_uchar_sat(convert_int_rte(out_f32));
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ *(__global uchar *)output.ptr = result_u8;
}
int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
@@ -217,6 +246,11 @@
vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
#endif /* defined(POOL_AVG) */
+ uchar8 out_u8 = convert_uchar8(vdata);
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ REQUANTIZE(8, out_u8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_u8);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
// Store result
- vstore8(convert_uchar8(vdata), 0, (__global uchar *)output.ptr);
-}
\ No newline at end of file
+ vstore8(out_u8, 0, (__global uchar *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
index 80ea540..7ae34ef 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,53 +23,63 @@
*/
#include "helpers.h"
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+
/** This performs the quantization of floating point inputs to 8-bit unsigned integers.
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
- * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
- * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void quantization_layer(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- VECTOR_DECLARATION(min_max))
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- // min_max_value.s0 = min, min_max_value.s1 = max
- const float2 min_max_value = vload2(0, (__global float *)(min_max_ptr + min_max_offset_first_element_in_bytes));
-
- const float4 vmin = (float4)min_max_value.s0;
- const float4 vrange = (float4)(min_max_value.s1 - min_max_value.s0);
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
// Load data
- float4 data = vload4(0, (__global float *)input.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
- // Map float values to range [0.0, 1.0]
- data = (data - vmin) / vrange;
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) vscale = SCALE;
+ const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
- // Quantize and saturate
- uchar4 res = convert_uchar4_sat(data * 256.0f);
+ // Quantize
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE) + voffset, 0, 255);
- // Store result
- vstore4(res, 0, (__global uchar *)output.ptr);
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)output.ptr);
+#else //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global uchar *)(output.ptr)) = (uchar)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, 0, 255);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
}
+#endif //defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
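+
+// A scalar sketch of the quantization above (illustrative only; the helper below is not used by the
+// kernel): q = clamp(round(x / SCALE) + OFFSET, 0, 255). For example, with SCALE = 0.5f and
+// OFFSET = 10, x = 2.3f maps to clamp(round(4.6) + 10, 0, 255) = 15.
+inline uchar quantize_one_sketch(float x, float scale, int offset)
+{
+    const int q = convert_int_rte(x / scale) + offset; // round to nearest (even), then add the zero-point
+    return (uchar)clamp(q, 0, 255);                    // saturate to the U8 range
+}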
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index b4ede25..2651123 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -307,6 +307,10 @@
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(COMPLEX)
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
#if defined(SUM_SQUARE)
res *= res;
#endif // defined(SUM_SQUARE)
@@ -320,6 +324,11 @@
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(COMPLEX)
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#endif // defined(COMPLEX)
+
#if defined(ARG_MAX)
uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
indx = select(indx, z, cond_conv);
@@ -334,8 +343,11 @@
#endif // defined(SUM_SQUARE)
#if defined(PROD)
res *= in;
-#else //!defined(PROD)
+#else //!defined(PROD)
res += in;
+#if defined(COMPLEX)
+ res1 += in1;
+#endif // defined(COMPLEX)
#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -348,6 +360,9 @@
res /= DEPTH;
#endif // defined(MEAN)
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+#if defined(COMPLEX)
+ vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
+#endif // defined(COMPLEX)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
#endif /* defined(DEPTH) */
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 3b9b1e9..3f203b8 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,18 @@
#if defined(SRC_DIM_Z)
+#define OUTPUT_ROW_2x2_7x7(out, tmp) \
+ ({ \
+ out.s0 = -tmp.s0 / 36.f; \
+ out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \
+ out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \
+ out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
+ out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
+ out.s7 = tmp.s6; \
+ })
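+
+// Viewed as a matrix product, the macro above computes out = G * tmp for one 7-tap filter row,
+// where G is the 8x7 filter-transform matrix whose rows can be read directly from the expressions
+// above (restated here only for reference):
+//   [ -1/36      0        0        0         0          0          0      ]
+//   [  1/48    -1/48     1/48    -1/48      1/48      -1/48       1/48    ]
+//   [  1/48     1/48     1/48     1/48      1/48       1/48       1/48    ]
+//   [ -1/120    2/120   -4/120    8/120   -16/120     32/120    -64/120   ]
+//   [ -1/120   -2/120   -4/120   -8/120   -16/120    -32/120    -64/120   ]
+//   [  1/720   -3/720    9/720  -27/720    81/720   -243/720    729/720   ]
+//   [  1/720    3/720    9/720   27/720    81/720    243/720    729/720   ]
+//   [  0        0        0        0         0          0          1      ]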
+
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1045,6 +1057,306 @@
*(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
+/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_7x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Load the values from the input tensor
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
+
+ DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
+ DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp = 0.0f;
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+
+ out0.s0 = -w00 / 36.0f;
+ out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
+ out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
+ out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
+ out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
+ out0.s7 = w06;
+
+ out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+
+ tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
+ tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
+ tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
+ tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
+ tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
+ tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
+ tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out1, tmp);
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+
+ tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
+ tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
+ tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
+ tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
+ tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
+ tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
+ tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
+
+ OUTPUT_ROW_2x2_7x7(out2, tmp);
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+
+ tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out3, tmp);
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+
+ tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
+ tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
+ tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
+ tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
+ tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
+ tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
+ tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
+
+ OUTPUT_ROW_2x2_7x7(out4, tmp);
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+
+ tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out5, tmp);
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+
+ tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
+ tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
+ tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
+ tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
+ tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
+ tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
+ tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
+
+ OUTPUT_ROW_2x2_7x7(out6, tmp);
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+
+ tmp.s0 = w60;
+ tmp.s1 = w61;
+ tmp.s2 = w62;
+ tmp.s3 = w63;
+ tmp.s4 = w64;
+ tmp.s5 = w65;
+ tmp.s6 = w66;
+
+ OUTPUT_ROW_2x2_7x7(out7, tmp);
+
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int x0 = get_global_id(2); // idx filter
+ int y0 = get_global_id(0); // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
#endif // defined(SRC_DIM_Z)
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
@@ -1292,6 +1604,55 @@
dst_step_z,
dst_offset_first_element_in_bytes);
}
+
+/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
@@ -1539,4 +1900,53 @@
dst_step_z,
dst_offset_first_element_in_bytes);
}
+
+/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index 34bf290..630a78b 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,6 +43,24 @@
out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
})
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
+ comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
+ comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
+ out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+ })
+
#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
*
@@ -85,7 +103,7 @@
const int z = get_global_id(2) % SRC_DEPTH;
const int b = get_global_id(2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif /* defined(SRC_DEPTH) */
// Compute input address
@@ -221,7 +239,7 @@
const int z = (get_global_id(2) * 2) % SRC_DEPTH;
const int b = (get_global_id(2) * 2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2) * 2;
+ const int z = get_global_id(2) * 2;
#endif /* defined(SRC_DEPTH) */
// Compute input address
@@ -403,7 +421,7 @@
const int z = get_global_id(2) % SRC_DEPTH;
const int b = get_global_id(2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif /* defined(SRC_DEPTH) */
// Compute input address
@@ -430,7 +448,7 @@
VEC_DATA_TYPE(DATA_TYPE, 4)
d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
VEC_DATA_TYPE(DATA_TYPE, 2)
- d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
DATA_TYPE out0 = 0.0f;
@@ -495,7 +513,7 @@
#if defined(SRC_DEPTH)
__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
#else /* defined(SRC_DEPTH) */
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
#endif /* defined(SRC_DEPTH) */
uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
@@ -728,14 +746,14 @@
const int z = get_global_id(2) % SRC_DEPTH;
const int b = get_global_id(2) / SRC_DEPTH;
#else /* defined(SRC_DEPTH) */
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif /* defined(SRC_DEPTH) */
// Compute input address
#if defined(SRC_DEPTH)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
#else /* defined(SRC_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
#endif /* defined(SRC_DEPTH) */
src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
@@ -933,7 +951,7 @@
const int z = get_global_id(2) % NUM_TILES_Y;
const int b = get_global_id(2) / NUM_TILES_Y;
#else /* defined(NUM_TILES_Y) */
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif /* defined(NUM_TILES_Y) */
#if defined(NUM_TILES_Y)
@@ -1010,8 +1028,8 @@
DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
- int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
+ int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
+ int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0);
valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0);
@@ -1021,12 +1039,12 @@
z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
- DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
- DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
- DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
- DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
- DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
- DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+ DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+ DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+ DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+ DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+ DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+ DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
@@ -1096,7 +1114,7 @@
#if defined(NUM_TILES_Y)
__global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
#else /* defined(NUM_TILES_Y) */
- __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
#endif /* defined(NUM_TILES_Y) */
uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
@@ -1333,14 +1351,14 @@
const int z = get_global_id(2) % NUM_TILES_Y;
const int b = get_global_id(2) / NUM_TILES_Y;
#else /* defined(NUM_TILES_Y) */
- const int z = get_global_id(2);
+ const int z = get_global_id(2);
#endif /* defined(NUM_TILES_Y) */
// Compute input address
#if defined(NUM_TILES_Y)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
#else /* defined(NUM_TILES_Y) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
#endif /* defined(NUM_TILES_Y) */
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
@@ -1573,6 +1591,370 @@
OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Store values across the channels
+#if defined(NUM_TILES_Y)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* NUM_TILES_Y */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 and the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+ const int z = get_global_id(2) % NUM_TILES_Y;
+ const int b = get_global_id(2) / NUM_TILES_Y;
+#else /* defined(NUM_TILES_Y) */
+ const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
+
+ // Compute input address
+#if defined(NUM_TILES_Y)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else /* defined(NUM_TILES_Y) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ // Clamp coordinates. This clamp is valid for all rows
+ int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+ y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+ // Clamp coordinates. This clamp is valid for all columns
+ int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+ int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ // Load the input tile
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0;
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+ OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x7 kernels
+ int y_coord = y * (int)OUTPUT_TILE_W;
+
+ // Row0
+ // Clamp the z coordinates: with a 1x7 kernel the tile can read out-of-bound rows, so out-of-range reads are redirected to the padding area
+ int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
+ int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1
+ valid_y = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1
+ z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate
+
+ // Load the input tile
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0;
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * (int)src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * (int)src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * (int)src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * (int)src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * (int)src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * (int)src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * (int)src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * (int)src_stride_z);
+
+ // Calculate common factors for intermediate tensor
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = ((VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.0f) * in_row0;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
+
+ OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+
+ // Clamp coordinates. This clamp is valid for all rows
+ int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
+ y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1);
+
+ // Row0
+ int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0;
+ int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate
+
+ // Load the input tile
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row1
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row2
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row3
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row4
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row5
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row6
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ // Row7
+ z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
+ valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0);
+ valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
+ z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
+
+ in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z);
+ in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = (DATA_TYPE)36.0f * in_row1 - (DATA_TYPE)13.0f * in_row3 + in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)9.0f * in_row2 - (DATA_TYPE)10.0f * in_row4 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact3 = (DATA_TYPE)18.0f * in_row1 - (DATA_TYPE)20.0f * in_row3 + (DATA_TYPE)2.0f * in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact4 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact5 = (DATA_TYPE)12.0f * in_row1 - (DATA_TYPE)15.0f * in_row3 + (DATA_TYPE)3.0f * in_row5;
+
+ // Calculate intermediate tensors
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = -(DATA_TYPE)36.0f * in_row0 + (DATA_TYPE)49.0f * in_row2 - (DATA_TYPE)14.0f * in_row4 + in_row6;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact2 - comm_fact3;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 + comm_fact3;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact4 - comm_fact5;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact4 + comm_fact5;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = -(DATA_TYPE)36.0f * in_row1 + (DATA_TYPE)49.0f * in_row3 - (DATA_TYPE)14.0f * in_row5 + in_row7;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_2x2_7x7(out7, tmp7, comm_fact0);
+
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Store values across the channels
@@ -1981,6 +2363,62 @@
src_stride_w,
dst_stride_w);
}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
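+
+/* Illustrative build-option sketch (not part of the library API; all values are hypothetical and
+ * would normally be derived by the host-side kernel configuration): compiling the 7x1 input
+ * transform above for a 224x224 F32 input with "same" padding could look like
+ *
+ *   -DDATA_TYPE=float -DNUM_TILES_X=112 -DPAD_LEFT=3 -DPAD_TOP=0
+ *   -DSRC_DIM_1=224 -DSRC_DIM_2=224 -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=1
+ *   -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL
+ */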
#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
@@ -2313,6 +2751,62 @@
src_stride_w,
dst_stride_w);
}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=7).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
+}
#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index e979978..cffc12d 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -180,6 +180,240 @@
vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
+
+#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
+ ({ \
+ col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
+ col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 - 3 * d5 + 3 * d6 + d7; \
+ })
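+
+/* Sketch of what the macro above computes: for each of the 8 columns of the Winograd workspace it
+ * applies the two rows of the F(2, 7) output-transform matrix (A^T), using the coefficients visible
+ * in the macro body:
+ *
+ *   col.s0 = 1*d0 + 1*d1 + 1*d2 + 1*d3 + 1*d4 + 1*d5 + 1*d6 + 0*d7
+ *   col.s1 = 0*d0 - 1*d1 + 1*d2 - 2*d3 + 2*d4 - 3*d5 + 3*d6 + 1*d7
+ *
+ * i.e. the 8 Winograd-domain values along a column collapse to the 2 spatial output rows.
+ */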
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1/1x2, the filter size is 7x7/7x1/1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ // Each thread stores a 2x2/2x1 or 1x2 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ int y_in = get_global_id(1);
+ int x_out = get_global_id(0);
+ int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the input tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00 and out01
+ float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
+ float out01 = -d01 + d02 - 2.0f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Get output address
+#if defined(SRC_DEPTH)
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+ *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Get output address
+ int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+ *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 8x2 intermediate tensor
+ VEC_DATA_TYPE(float, 2)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
+
+ // Compute the 2x2 output tile
+ VEC_DATA_TYPE(float, 2)
+ out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
+ VEC_DATA_TYPE(float, 2)
+ out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+ out_col0 += (VEC_DATA_TYPE(float, 2))b;
+ out_col1 += (VEC_DATA_TYPE(float, 2))b;
+
+#endif // defined(HAS_BIAS)
+ // Get output address
+#if defined(SRC_DEPTH)
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+ int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+ // Store the output tile
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
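+
+/* Sketch of the out-of-bounds guard used above, with purely illustrative numbers: if dst_size = 1000,
+ * dst_stride_z = 400 and the base offset is 800, then offset = (800, min(1200, 1000)) = (800, 1000)
+ * and mult_y = (min(200, 1), min(0, 1)) = (1, 0). The second z plane therefore collapses onto the
+ * trailing padding element at dst_size with a zeroed y stride, so no store lands outside the tensor.
+ */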
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -227,8 +461,8 @@
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */
// Load the values across the channels to compose the 6x6 or 6x1 tile
@@ -599,7 +833,7 @@
#if defined(SRC_DEPTH)
int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
#else /* defined(SRC_DEPTH) */
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
@@ -1231,6 +1465,72 @@
#endif // defined(HAS_BIAS)
);
}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1573,6 +1873,72 @@
#endif // defined(HAS_BIAS)
);
}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
new file mode 100644
index 0000000..4597d79
--- /dev/null
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+ bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose)
+{
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Configure GEMMLHSMatrixInfo
+ lhs_info.m0 = m0;
+ lhs_info.k0 = k0;
+ lhs_info.v0 = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+ lhs_info.interleave = lhs_interleave;
+ lhs_info.transpose = lhs_transpose;
+
+ // Configure GEMMRHSMatrixInfo
+ rhs_info.n0 = n0;
+ rhs_info.k0 = lhs_info.k0;
+ rhs_info.h0 = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+ rhs_info.interleave = rhs_interleave;
+ rhs_info.transpose = rhs_transpose;
+
+ return std::make_pair(lhs_info, rhs_info);
+}
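+
+// Worked example (hypothetical values, for illustration only): configure_lhs_rhs_info(8, 256, 4, 4, 4, 16, 16, ...)
+// yields lhs_info.v0 = 1 because 8 / (4 * 16) == 0 (not enough rows for 16 vertically-interleaved blocks),
+// while rhs_info.h0 stays at 16 because 256 / (4 * 16) == 4.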
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000..b791c1c
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedKernelConfigurationBifrost::CLGEMMReshapedKernelConfigurationBifrost(GPUTarget arch)
+ : ICLGEMMKernelConfiguration(arch)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+ ARM_COMPUTE_UNUSED(data_type);
+
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ // Configurations for Mali-G76
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+ {
+ { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 }
+ };
+
+ // Configurations for Mali-G7x
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+ {
+ { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }
+ };
+
+ switch(_target)
+ {
+ case GPUTarget::G76:
+ return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+ default:
+ return (this->*gemm_configs_G7x[data_type])(m, n, k, b);
+ }
+}
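+
+// Usage sketch (illustrative only, dimensions are arbitrary): a caller holding the GPU target could do
+//
+//   CLGEMMReshapedKernelConfigurationBifrost config(GPUTarget::G76);
+//   auto lhs_rhs = config.configure(1024, 1024, 256, 1, DataType::F32);
+//
+// Mali-G76 dispatches to its dedicated table above; any other Bifrost target falls back to the G7x entries.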
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if(n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
+ }
+ }
+ else
+ {
+ if(n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+ }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
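Note: the configuration class added above dispatches through a map of data type to pointer-to-member function and falls back to the G7x table for any Bifrost target other than G76. Below is a minimal, self-contained sketch of that dispatch pattern; the types (Config, ReshapedConfigBifrost) and the tile numbers are simplified stand-ins for illustration, not the library's GEMMLHSMatrixInfo/GEMMRHSMatrixInfo API.

    #include <cstdio>
    #include <map>

    enum class DataType { F32, QASYMM8 };
    enum class GPUTarget { G71, G76 };

    struct Config { int m0, n0, k0; }; // simplified stand-in for the LHS/RHS info pair

    class ReshapedConfigBifrost
    {
    public:
        explicit ReshapedConfigBifrost(GPUTarget target) : _target(target) {}

        Config configure(unsigned int m, unsigned int n, DataType dt)
        {
            using Fn = Config (ReshapedConfigBifrost::*)(unsigned int, unsigned int);

            // One table per GPU variant; unknown Bifrost parts fall back to the G7x table.
            static std::map<DataType, Fn> configs_g76 = { { DataType::F32, &ReshapedConfigBifrost::g76_f32 },
                                                          { DataType::QASYMM8, &ReshapedConfigBifrost::g76_u8 } };
            static std::map<DataType, Fn> configs_g7x = { { DataType::F32, &ReshapedConfigBifrost::g7x_f32 },
                                                          { DataType::QASYMM8, &ReshapedConfigBifrost::g7x_u8 } };

            auto &table = (_target == GPUTarget::G76) ? configs_g76 : configs_g7x;
            return (this->*table.at(dt))(m, n);
        }

    private:
        // Per-target, per-type heuristics keyed only on n, mirroring the structure above.
        Config g76_f32(unsigned int, unsigned int n) { return n <= 4 ? Config{ 4, 2, 8 } : Config{ 4, 4, 2 }; }
        Config g76_u8(unsigned int, unsigned int n)  { return n <= 4 ? Config{ 4, 2, 16 } : Config{ 4, 4, 16 }; }
        Config g7x_f32(unsigned int, unsigned int n) { return n <= 4 ? Config{ 4, 2, 8 } : Config{ 5, 4, 4 }; }
        Config g7x_u8(unsigned int, unsigned int n)  { return n <= 4 ? Config{ 4, 2, 8 } : Config{ 6, 4, 4 }; }

        GPUTarget _target;
    };

    int main()
    {
        ReshapedConfigBifrost cfg(GPUTarget::G76);
        const Config c = cfg.configure(64, 8, DataType::F32);
        std::printf("m0=%d n0=%d k0=%d\n", c.m0, c.n0, c.k0);
    }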
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000..483bab8
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget arch)
+ : ICLGEMMKernelConfiguration(arch)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+ ARM_COMPUTE_UNUSED(data_type);
+
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
+ unsigned int b);
+
+ // Configurations for Mali-G76
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+ {
+ { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 }
+ };
+
+ // Configurations for Mali-G7x
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+ {
+ { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }
+ };
+
+ switch(_target)
+ {
+ case GPUTarget::G76:
+ return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+ default:
+ return (this->*gemm_configs_G7x[data_type])(m, n, k, b);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(m == 1)
+ {
+ if(n > 2048)
+ {
+ const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+ }
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if(m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
+ }
+ }
+ else
+ {
+ if(m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 4, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, h0, false, true, false, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(m == 1)
+ {
+ const unsigned int h0 = std::max(n / 2, static_cast<unsigned int>(1));
+ return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
+ }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
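Note: in the m == 1 (vector-times-matrix) branches above, the number of horizontal RHS blocks h0 is derived from n before being handed to configure_lhs_rhs_info. A small sketch of that derivation with concrete values; the helper name h0_for_gemv is local to this example.

    #include <algorithm>
    #include <cstdio>

    // Mirrors the h0 heuristic in the m == 1 branches: divide n by 2 (or 4 for large n),
    // but never let the block count drop below 1.
    unsigned int h0_for_gemv(unsigned int n, unsigned int divisor)
    {
        return std::max(n / divisor, 1u);
    }

    int main()
    {
        std::printf("n=4096, n/4 -> h0=%u\n", h0_for_gemv(4096, 4)); // large n: 1024 blocks
        std::printf("n=512,  n/2 -> h0=%u\n", h0_for_gemv(512, 2));  // 256 blocks
        std::printf("n=1,    n/2 -> h0=%u\n", h0_for_gemv(1, 2));    // clamped to 1
    }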
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index 58a8d10..aa06d3a 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_x * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
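Note: the fix above validates the output height against block_shape_y instead of block_shape_x. A short sketch of the expected batch-to-space shape relation, with assumed example sizes and a simplified Shape struct (illustration only):

    #include <cassert>
    #include <cstdio>

    struct Shape { unsigned int w, h, c, n; };

    // Batch-to-space moves block_x * block_y elements out of the batch dimension into the
    // spatial dimensions, so width scales by block_x and height by block_y.
    Shape batch_to_space_shape(const Shape &in, unsigned int block_x, unsigned int block_y)
    {
        return Shape{ in.w * block_x, in.h * block_y, in.c, in.n / (block_x * block_y) };
    }

    int main()
    {
        const Shape in{ 8, 8, 3, 12 };
        const Shape out = batch_to_space_shape(in, 2, 3);
        assert(out.w == 16 && out.h == 24 && out.c == 3 && out.n == 2);
        std::printf("out: %ux%ux%ux%u\n", out.w, out.h, out.c, out.n);
    }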
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index f5f5a0f..4f44851 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -219,6 +219,6 @@
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ return BorderSize{ 0, border, 0, 0 };
}
} // namespace arm_compute
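Note: the border returned above covers the columns that broadcasting replicates on the right-hand side of the comparison. A worked sketch with assumed sizes (the values are examples, not taken from the library):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned int out_dim0       = 16; // output width
        const unsigned int in1_dim0       = 16;
        const unsigned int in2_dim0       = 1;  // broadcast operand
        const unsigned int elems_per_iter = 16; // vector width of the kernel

        // Columns produced purely by replicating the broadcast operand.
        const unsigned int replicate = out_dim0 - std::min(in1_dim0, in2_dim0); // 15

        // The right border never exceeds one vector minus one element.
        const unsigned int border = std::min(elems_per_iter - 1U, replicate);   // 15

        std::printf("right border = %u elements\n", border);
    }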
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index e677793..d9c7ede 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,9 +39,12 @@
#include <sstream>
#include <string>
-using namespace arm_compute;
-
-#define MAX_MATRIX_SIZE 81
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int max_matrix_size = 81;
+} // namespace
/****************************************************************************************\
* Square Convolution *
@@ -138,8 +141,8 @@
// Set build options
std::set<std::string> build_opts;
- int16_t mat[matrix_size * matrix_size] = { 0 };
- memcpy(mat, conv, matrix_size * sizeof(int16_t));
+ std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
+ memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
@@ -173,7 +176,7 @@
template <unsigned int matrix_size>
BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
{
- return BorderSize(matrix_size / 2, 0);
+ return BorderSize{ matrix_size / 2, 0 };
}
template <unsigned int matrix_size>
@@ -190,8 +193,8 @@
std::set<std::string> build_opts;
- int16_t mat[matrix_size * matrix_size] = { 0 };
- memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
+ std::array<int16_t, matrix_size *matrix_size> mat = { 0 };
+ memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t));
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
@@ -264,11 +267,11 @@
uint32_t matrix_size = width * height;
- int16_t mat[MAX_MATRIX_SIZE] = { 0 };
+ std::array<int16_t, max_matrix_size> mat = { 0 };
- memcpy(mat, conv, matrix_size * sizeof(int16_t));
+ memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));
- for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
+ for(unsigned int j = 0; j < max_matrix_size; j++)
{
options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
@@ -328,3 +331,4 @@
template class arm_compute::CLSeparableConvolutionHorKernel<5>;
template class arm_compute::CLSeparableConvolutionHorKernel<7>;
template class arm_compute::CLSeparableConvolutionHorKernel<9>;
+} // namespace arm_compute
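Note: the changes above swap fixed-size C arrays for std::array while keeping the memcpy-based fill and the per-coefficient -DMATj build options. A minimal sketch of that pattern outside the kernel, with illustrative sizes only:

    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        constexpr unsigned int matrix_size = 3;

        // Source coefficients, e.g. a user-provided separable convolution row.
        const int16_t conv[matrix_size] = { 1, -2, 1 };

        // Zero-initialised std::array replaces `int16_t mat[N] = { 0 };`
        std::array<int16_t, matrix_size * matrix_size> mat = { 0 };

        // .data() supplies the raw pointer that memcpy expects.
        std::memcpy(mat.data(), conv, matrix_size * sizeof(int16_t));

        for(std::size_t j = 0; j < mat.size(); ++j)
        {
            std::printf("-DMAT%zu=%d\n", j, mat[j]);
        }
    }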
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index e14e5da..c87768a 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,38 +37,57 @@
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList())
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(!padding.empty() && output_window != nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
// Validate output if initialized
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
+ if(output_window == nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output_window->shape());
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, Window *output_window)
{
 // Output auto initialization if not yet initialized

auto_init_if_empty(*output, *input);
// Configure window
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+ const unsigned int vec_size_x = 16 / input->element_size();
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ if(output_window == nullptr)
+ {
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*input, Steps(vec_size_x));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input, 0, vec_size_x);
+ AccessWindowHorizontal output_access(output, 0, vec_size_x);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+ }
+ else
+ {
+ Window win = calculate_max_window(*input);
+ return std::make_pair(Status{}, win);
+ }
}
std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
@@ -131,14 +150,14 @@
} // namespace
CLCopyKernel::CLCopyKernel()
- : _input(nullptr), _output(nullptr)
+ : _input(nullptr), _output(nullptr), _output_window(), _has_output_window(false)
{
}
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, output_window));
_input = input;
_output = output;
@@ -147,21 +166,44 @@
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-
std::pair<Status, Window> win_config;
+ const unsigned int vec_size_x = 16 / input->info()->element_size();
+
if(padding.empty())
{
+ // Configure window
+ win_config = validate_and_configure_window(input->info(), output->info(), output_window);
+
+ if(output_window != nullptr)
+ {
+ _has_output_window = true;
+ _output_window = Window(*output_window);
+ const int width_x = output_window->num_iterations(0);
+ const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
+ const bool remainder_x = width_x % vec_size_x > 0;
+
+ if(multi_access_x)
+ {
+ _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x));
+ win_config.second.set(Window::DimX, Window::Dimension(win_config.second.x().start(), ceil_to_multiple(win_config.second.x().end(), vec_size_x), vec_size_x));
+ }
+
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(width_x - vec_size_x, 0)));
+ }
+ else
+ {
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ }
+
// Build kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
-
- // Configure window
- win_config = validate_and_configure_window(input->info(), output->info());
}
else
{
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
// Add compile time options
add_padding_as_build_options(padding, build_opts);
@@ -185,13 +227,13 @@
ICLKernel::configure_internal(win_config.second);
}
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding)
+Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding, Window *output_window)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, output_window));
if(padding.empty())
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), output_window).first);
}
else
{
@@ -206,16 +248,33 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
+ Window slice;
- do
+ if(_has_output_window)
{
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
+ slice = window.first_slice_window_3D();
+ Window out_slice = _output_window.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && _output_window.slide_window_slice_3D(out_slice));
}
- while(collapsed.slide_window_slice_3D(slice));
+ else
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ slice = collapsed.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+ }
}
} // namespace arm_compute
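Note: when an output window is supplied, the copy kernel above rounds the X dimension up to the vector size and, if a tail remains, passes the last aligned column via -DLAST_ACCESSED_X. A worked sketch of those quantities with assumed values (ceil_to_multiple reimplemented locally for the example):

    #include <algorithm>
    #include <cstdio>

    unsigned int ceil_to_multiple(unsigned int value, unsigned int step)
    {
        return ((value + step - 1) / step) * step;
    }

    int main()
    {
        const unsigned int element_size = 4;                 // e.g. F32
        const unsigned int vec_size_x   = 16 / element_size; // 4 elements per access
        const unsigned int width_x      = 10;                 // iterations along X

        const bool multi_access_x = width_x >= vec_size_x;      // true
        const bool remainder_x    = (width_x % vec_size_x) > 0; // true (10 % 4 == 2)

        const unsigned int rounded_end   = ceil_to_multiple(width_x, vec_size_x);  // 12
        const unsigned int last_accessed = std::max<int>(width_x - vec_size_x, 0); // 6

        std::printf("multi_access=%d remainder=%d end=%u LAST_ACCESSED_X=%u\n",
                    multi_access_x, remainder_x, rounded_end, last_accessed);
    }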
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
new file mode 100644
index 0000000..f8a2456
--- /dev/null
+++ b/src/core/CL/kernels/CLCropKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCropKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <map>
+
+namespace arm_compute
+{
+CLCropKernel::CLCropKernel()
+ : _input(nullptr), _output(nullptr), _start(), _batch_index(0), _extrapolation_value(0)
+{
+}
+
+void CLCropKernel::configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), start, end, batch_index, extrapolation_value, output_window));
+
+ _input = input;
+ _output = output;
+ _start = start;
+ _batch_index = batch_index;
+ _extrapolation_value = extrapolation_value;
+
+ const int vec_size_x = 4;
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*output->info());
+
+ if(output_window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *output_window);
+ win = *output_window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
+
+ if(multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
+ build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("crop_tensor", build_opts.options()));
+}
+
+Status CLCropKernel::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *output_window)
+{
+ ARM_COMPUTE_UNUSED(extrapolation_value, output_window);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(input->dimension(1)) || start.y >= static_cast<int32_t>(input->dimension(2))
+ || end.x >= static_cast<int32_t>(input->dimension(1)) || end.y >= static_cast<int32_t>(input->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= input->dimension(3));
+ if(output_window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_window->x().step() != 1);
+ }
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 3);
+ }
+ return Status{};
+}
+
+void CLCropKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window in_slice = Window();
+ in_slice.use_tensor_dimensions(_input->info()->tensor_shape());
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
+ in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
+
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, window);
+ add_argument(idx, _start.x);
+ add_argument(idx, _start.y);
+ enqueue(queue, *this, window);
+}
+} // namespace arm_compute
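Note: the crop kernel accepts start/end coordinates in either order and signals a flipped axis to the OpenCL kernel via -DWIDTH_FLIPPED / -DHEIGHT_FLIPPED. A simplified sketch of the bounds check and flip detection in plain C++ (not the library's validate macros; Coord2D and crop_in_bounds are local to this example):

    #include <cstdint>
    #include <cstdio>

    struct Coord2D { int32_t x, y; };

    // Returns true when the crop box lies inside a W x H image (any corner order allowed).
    bool crop_in_bounds(Coord2D start, Coord2D end, int32_t width, int32_t height)
    {
        const bool non_negative = start.x >= 0 && start.y >= 0 && end.x >= 0 && end.y >= 0;
        const bool inside       = start.x < width && end.x < width && start.y < height && end.y < height;
        return non_negative && inside;
    }

    int main()
    {
        const Coord2D start{ 30, 5 };
        const Coord2D end{ 10, 20 };

        std::printf("in bounds: %d\n", crop_in_bounds(start, end, 64, 64));
        std::printf("width flipped:  %d\n", start.x > end.x); // 1 -> kernel built with -DWIDTH_FLIPPED
        std::printf("height flipped: %d\n", start.y > end.y); // 0
    }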
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
new file mode 100644
index 0000000..71218f5
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
+ const DataLayout data_layout = input_info->data_layout();
+ const unsigned int stride_x = deconv_info.stride().first;
+ const unsigned int stride_y = deconv_info.stride().second;
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input_info->data_type());
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::S32);
+ if(!is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
+
+ if(bias != nullptr)
+ {
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, input);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
+ }
+
+ if(output->total_size() != 0)
+ {
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ 0, 0, stride_x, stride_y);
+
+ const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ }
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ const DataLayout data_layout = input_info->data_layout();
+
+ const unsigned int stride_x = deconv_info.stride().first;
+ const unsigned int stride_y = deconv_info.stride().second;
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ 0, 0, stride_x, stride_y);
+
+ const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+
+ Window win = calculate_max_window(*input);
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
+ : _add_bias(false),
+ _bias(nullptr)
+{
+}
+
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ const DataLayout data_layout = input_info->data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ _input = input;
+ _output = output;
+ _add_bias = (bias != nullptr);
+ _bias = bias;
+
+ const int filter_w = weights_info->dimension(idx_w);
+ const int filter_h = weights_info->dimension(idx_h);
+ const int filter_b = weights_info->dimension(idx_b);
+ const int img_w = input_info->dimension(idx_w);
+ const int img_h = input_info->dimension(idx_h);
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DFILTER_WIDTH=" + support::cpp11::to_string(filter_w));
+ build_opts.add_option("-DFILTER_HEIGHT=" + support::cpp11::to_string(filter_h));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(img_w));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(img_h));
+ build_opts.add_option_if(data_layout == DataLayout::NCHW, "-DNUM_FILTERS=" + support::cpp11::to_string(filter_b));
+ build_opts.add_option_if(_add_bias, "-DADD_BIAS");
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_reshape", build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "deconvolution_reshape_output_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
+ return Status{};
+}
+
+void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, collapsed);
+ add_3D_tensor_argument(idx, _output, collapsed);
+ if(_add_bias)
+ {
+ add_1D_tensor_argument(idx, _bias, collapsed);
+ }
+ enqueue(queue, *this, collapsed, lws_hint());
+}
+} // namespace arm_compute
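Note: the config_id assembled above keys the LWS tuner on the kernel variant plus the shapes involved. A minimal sketch of that string-building convention; make_config_id is a local helper written for this example, with the field order taken from the snippet above.

    #include <cstdio>
    #include <string>

    // Builds a tuner key of the form "<name>_<dtype>_<layout>_<in0>_<in1>_<out0>_<out1>".
    std::string make_config_id(const std::string &name, const std::string &dtype, const std::string &layout,
                               unsigned int in0, unsigned int in1, unsigned int out0, unsigned int out1)
    {
        std::string id = name + "_";
        id += dtype + "_";
        id += layout + "_";
        id += std::to_string(in0) + "_";
        id += std::to_string(in1) + "_";
        id += std::to_string(out0) + "_";
        id += std::to_string(out1);
        return id;
    }

    int main()
    {
        std::printf("%s\n", make_config_id("deconvolution_reshape_output", "f32", "nhwc", 9, 64, 10, 10).c_str());
    }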
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 3fccc04..1cae371 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,19 +47,13 @@
{
ARM_COMPUTE_UNUSED(depth_offset);
- // Configure kernel window
- const int left_right = (output->dimension(0) - input->dimension(0)) / 2;
- const int top_bottom = (output->dimension(1) - input->dimension(1)) / 2;
-
const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
- const unsigned int num_elems_read_per_iteration = 16 / input->element_size();
- const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
- AccessWindowRectangle input_access(input, -left_right, -top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -74,30 +68,20 @@
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) > output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) > output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
- // The gaps between the two lowest dimensions of input and output need to be divisible by 2
- // Otherwise it is not clear how the padding should be added onto the input tensor
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) - input->dimension(0)) % 2);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(1) - input->dimension(1)) % 2);
-
return Status{};
}
} // namespace
CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+ : _input(nullptr), _output(nullptr), _depth_offset(0)
{
}
-BorderSize CLDepthConcatenateLayerKernel::border_size() const
-{
- return BorderSize(_top_bottom, _left_right);
-}
-
void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -125,10 +109,6 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options()));
// Configure kernel window
- _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
- _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
- // Configure kernel window
auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -153,16 +133,8 @@
const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- const cl_int3 offsets =
- {
- {
- static_cast<cl_int>(_left_right),
- static_cast<cl_int>(_top_bottom),
- static_cast<cl_int>(offset_to_first_elements_in_bytes),
- }
- };
- _kernel.setArg<cl_int3>(idx, offsets);
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
do
{
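Note: with the per-axis padding offsets removed above, the only extra kernel argument is the byte offset of the destination depth slice, computed from the depth offset and the output's Z stride. A worked sketch of that arithmetic for an assumed, densely packed F32 NCHW output:

    #include <cstdio>

    int main()
    {
        // Assumed output tensor: 16 x 16 x 32, F32, densely packed NCHW.
        const unsigned int element_size   = 4;
        const unsigned int width          = 16;
        const unsigned int height         = 16;
        const unsigned int stride_z_bytes = width * height * element_size; // bytes per depth plane

        const unsigned int depth_offset = 8; // this input starts at plane 8 of the output

        const unsigned int offset_to_first_elements_in_bytes = depth_offset * stride_z_bytes;
        std::printf("offset = %u bytes\n", offset_to_first_elements_in_bytes); // 8 * 1024 = 8192
    }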
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index eb561fa..02d8c6d 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,19 +43,21 @@
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info, const Size2D dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
"For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+
const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
if(biases != nullptr)
@@ -74,7 +76,7 @@
if(output->total_size() != 0)
{
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
@@ -82,10 +84,10 @@
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- GPUTarget gpu_target, std::string &kernel_name)
+ GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation)
{
 // Output auto initialization if not yet initialized
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
const unsigned int conv_stride_x = conv_info.stride().first;
@@ -171,12 +173,17 @@
{
const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
- kernel_name = is_qasymm ? (std::string("depthwise_convolution_3x3_quantized") + (is_dot8_supported ? "_dot8" : "") + "_nchw") : "depthwise_convolution_3x3";
+ kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3";
+ kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
+ kernel_name += (is_qasymm ? "_nchw" : "");
+
num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
- num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1) ? 2 : 1;
+ num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1 && dilation.y() == 1) ? 2 : 1;
num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
num_elems_read_per_iteration_y = num_elems_written_per_iteration_y + 2;
}
+ num_elems_read_per_iteration_x += (num_elems_read_per_iteration_x - 1) * (dilation.x() - 1);
+ num_elems_read_per_iteration_y += (num_elems_read_per_iteration_y - 1) * (dilation.y() - 1);
// Create window and update padding
Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
@@ -207,10 +214,10 @@
}
void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation));
bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
@@ -228,15 +235,18 @@
std::string kernel_name;
const GPUTarget gpu_target = get_target();
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
// Set build options
CLBuildOptions build_opts;
+ build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
if(is_qasymm)
@@ -256,31 +266,31 @@
if(act_info.enabled())
{
- const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
- const int o1 = input->info()->quantization_info().offset;
+ const int a_val = output->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_val = output->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ const int o1 = output->info()->quantization_info().offset;
- build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
- if(output != nullptr)
- {
- const float s1 = input->info()->quantization_info().scale;
- const float s2 = output->info()->quantization_info().scale;
- const int o2 = output->info()->quantization_info().offset;
-
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- if(o1 != o2 || s1 != s2)
- {
- build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
- build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
- }
- }
+ const float s1 = input->info()->quantization_info().scale;
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
}
}
+ else
+ {
+ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(act_info.enabled(), "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(win_config.second.x().step()));
+ }
+
+ build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
+ build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
+
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set config_id for enabling LWS tuning
@@ -300,12 +310,11 @@
}
Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info, GPUTarget gpu_target)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
{
std::string kernel_name;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation).first);
return Status{};
}
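Note: dilation widens the input footprint read per iteration, since each gap between filter taps grows by (dilation - 1) input columns. A worked sketch of the updated read-size arithmetic added above, with assumed values for an F32 NCHW case:

    #include <cstdio>

    int main()
    {
        // Assumed F32 NCHW case: 8 bytes written per iteration -> 2 output elements.
        const unsigned int conv_stride_x = 1;
        const unsigned int dilation_x    = 2;

        const unsigned int elems_written_x = 8 / 4;                                     // 2
        unsigned int elems_read_x          = 3 + (elems_written_x - 1) * conv_stride_x; // 4 (3x3 filter)

        // Each of the (elems_read_x - 1) gaps grows by (dilation_x - 1) extra input columns.
        elems_read_x += (elems_read_x - 1) * (dilation_x - 1); // 4 + 3 = 7

        std::printf("elements read per iteration along X: %u\n", elems_read_x);
    }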
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 5e5a35c..c31825c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -42,21 +42,23 @@
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC),
+ "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
ARM_COMPUTE_RETURN_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+
const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
const size_t weights_width = 3;
const size_t weights_height = 3;
@@ -89,7 +91,8 @@
if(output->total_size() != 0)
{
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
@@ -97,13 +100,14 @@
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
const size_t weights_width = 3;
const size_t weights_height = 3;
// Get convolved dimensions
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, weights_width, weights_height, conv_info, depth_multiplier);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation);
 // Output auto initialization if not yet initialized
auto_init_if_empty(*output,
@@ -112,10 +116,10 @@
input->data_type(),
input->quantization_info());
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1);
- const unsigned int num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
+ const unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->element_size());
const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2;
const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
@@ -138,7 +142,7 @@
}
else
{
- AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
+ AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1));
window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
}
@@ -166,15 +170,17 @@
}
void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
- auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation));
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
_input = input;
@@ -182,8 +188,8 @@
_weights = weights;
_biases = biases;
_conv_stride_y = conv_info.stride().second;
- _num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
- _num_planes_processed_per_iteration = is_stride_1 ? 2 : 1;
+ _num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
+ _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
// If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
if(is_dot8_supported && is_qasymm)
@@ -196,11 +202,14 @@
const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size());
CLBuildOptions build_opts;
+ build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
if(is_qasymm)
{
@@ -219,37 +228,28 @@
if(act_info.enabled())
{
- const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
- const int o1 = input->info()->quantization_info().offset;
+ const int a_val = output->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_val = output->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ const int o1 = output->info()->quantization_info().offset;
- build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
- if(output != nullptr)
- {
- const float s1 = input->info()->quantization_info().scale;
- const float s2 = output->info()->quantization_info().scale;
- const int o2 = output->info()->quantization_info().offset;
-
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
- if(o1 != o2 || s1 != s2)
- {
- build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
- build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
- }
- }
+ const float s1 = input->info()->quantization_info().scale;
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
}
}
else
{
+ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_opts.add_option_if(act_info.enabled(), "-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
}
- if(is_stride_1)
+ if(is_stride_1_dilation_1)
{
build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(_num_rows_processed_per_iteration));
build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
@@ -263,9 +263,24 @@
build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
"-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
+ std::string kernel_name;
// Create kernel
- std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
- && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+ if(is_qasymm)
+ {
+ kernel_name = std::string("dwc_3x3_reshaped_qasymm8");
+ kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : "");
+ kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
+ kernel_name += "_nhwc";
+ }
+ else
+ {
+ kernel_name = std::string("depthwise_convolution_3x3_nhwc");
+ kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
+ }
+
+ build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
+ build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
+
ICLKernel::configure_internal(win_config.second);
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -286,13 +301,12 @@
}
Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
biases != nullptr ? biases->clone().get() : nullptr,
- output->clone().get(), conv_info, depth_multiplier)
+ output->clone().get(), conv_info, depth_multiplier, dilation)
.first);
return Status{};
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 56e9db5..28d4ff2 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -33,7 +33,6 @@
#include "arm_compute/core/Types.h"
#include "support/ToolchainSupport.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
#include <tuple>
using namespace arm_compute;
@@ -45,7 +44,8 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
@@ -56,16 +56,18 @@
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || dilation.y() < 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
return Status{};
}
} // namespace
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
_input = input;
_output = output;
@@ -89,6 +91,8 @@
build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
build_opts.add_option("-D" + string_from_data_layout(input->info()->data_layout()));
build_opts.add_option_if(has_bias, "-DHAS_BIAS");
build_opts.add_option_if_else(is_data_type_quantized_asymmetric(input->info()->data_type()),
@@ -105,9 +109,10 @@
ICLKernel::configure_internal(win);
}
-Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
return Status{};
}
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index d4c1bec..78cc559 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -36,74 +37,78 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
if(output->tensor_shape().total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ // CLDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
- // Update window and padding
- bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
+ return std::make_tuple(Status{}, win);
}
} // namespace
CLDequantizationLayerKernel::CLDequantizationLayerKernel()
- : _input(nullptr), _output(nullptr), _min_max(nullptr)
+ : _input(nullptr), _output(nullptr)
{
}
-void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
- _input = input;
- _output = output;
- _min_max = min_max;
+ _input = input;
+ _output = output;
+
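+ // Vectorize along X: each work-item handles vec_size_x output elements (16 bytes) whenever the row is wide enough for at least one full vector access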
+ const int vec_size_x = 16 / output->info()->element_size();
+ const int output_width_x = output->info()->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*output->info());
+ if(multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options()));
}
-Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -115,20 +120,12 @@
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
Window slice = window_collapsed.first_slice_window_3D();
- Window min_max_window = window;
- min_max_window.set(Window::DimX, Window::Dimension(0, 0, 0));
- min_max_window.set(Window::DimY, Window::Dimension(0, _min_max->info()->dimension(1), 1));
- min_max_window.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window min_max_slice = min_max_window.first_slice_window_1D();
-
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- add_1D_tensor_argument(idx, _min_max, min_max_slice);
enqueue(queue, *this, slice);
}
- while(window_collapsed.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 471b320..12affa9 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -284,7 +284,7 @@
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
// Output auto initialization if not yet initialized
- // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+ // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output, output_shape,
1,
input->data_type(),
@@ -363,7 +363,7 @@
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
// Output auto initialization if not yet initialized
- // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
+ // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output->info(),
output_shape,
1,
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index 37eeeb7..63c9244 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -281,7 +281,7 @@
{
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ return BorderSize{ 0, border, 0, 0 };
}
/** Arithmetic operations with saturation*/
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000..b04293d
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_UNUSED(idx, config);
+
+ auto_init_if_empty(*output, input->clone()->set_num_channels(2));
+
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
+ : _input(nullptr), _output(nullptr), _idx(nullptr)
+{
+}
+
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
+
+ _input = input;
+ _output = output;
+ _idx = idx;
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(input->info()->num_channels()));
+ build_opts.add_option_if(config.conjugate, "-DCONJ");
+ std::string kernel_name = "fft_digit_reverse_axis_" + support::cpp11::to_string(config.axis);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), config);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+
+ return Status{};
+}
+
+void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_1D_tensor_argument(idx, _idx, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
new file mode 100644
index 0000000..83d55b7
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ if(output != nullptr)
+ {
+ auto_init_if_empty(*output, *input);
+ }
+
+ // Setup window steps
+ Steps steps;
+ steps.set(config.axis, config.radix);
+
+ Window win = calculate_max_window(*input, steps);
+ if(output != nullptr)
+ {
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTRadixStageKernel::CLFFTRadixStageKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+ _input = input;
+ _output = output;
+ _run_in_place = (output == nullptr) || (output == input);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+
+ // Create kernel
+ std::string kernel_name = "fft";
+ kernel_name += "_radix_" + support::cpp11::to_string(config.radix);
+ kernel_name += (config.is_first_stage) ? "_first_stage" : "";
+ kernel_name += "_axis_" + support::cpp11::to_string(config.axis);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set static arguments if not the first stage
+ if(!config.is_first_stage)
+ {
+ const unsigned int Ni = config.Nx * config.radix;
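+ // exp_const is the twiddle-factor angle step (-2 * pi / Ni) used by all but the first radix stage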
+ const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
+ unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_uint>(idx++, config.Nx);
+ _kernel.setArg<cl_uint>(idx++, Ni);
+ _kernel.setArg<cl_float>(idx, exp_const);
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ const bool run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ (run_in_place) ? nullptr : output->clone().get(),
+ config)
+ .first);
+
+ return Status{};
+}
+
+std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
+{
+ return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if(!_run_in_place)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
new file mode 100644
index 0000000..59f1fd7
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ if(output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ // CLFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ }
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTScaleKernel::CLFFTScaleKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+
+ _input = input;
+ _output = output;
+ _run_in_place = (output == nullptr) || (output == input);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option_if(config.conjugate, "-DCONJ");
+ std::string kernel_name = "fft_scale_conj";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set static arguments
+ unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_float>(idx, config.scale);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config)
+{
+ ARM_COMPUTE_UNUSED(config);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if(!_run_in_place)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index e14b8a3..150d9b6 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -139,7 +139,7 @@
epsilon));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = 16 / conv_weights->info()->element_size();
+ const unsigned int num_elems_processed_per_iteration_x = 4;
const int output_width_x = conv_weights->info()->tensor_shape().x();
const bool multi_access_x = (output_width_x / num_elems_processed_per_iteration_x > 0);
@@ -216,6 +216,6 @@
{
add_1D_tensor_argument(idx, _bn_gamma, vector_slice);
}
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice);
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index e9be1a6..a8c1704 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -165,7 +165,7 @@
} // namespace
CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
{
}
@@ -181,6 +181,7 @@
_output = output;
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
_k = gemm_info.k();
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// Check if we need to slide the matrix B
const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -201,6 +202,9 @@
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -302,7 +306,7 @@
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
new file mode 100644
index 0000000..923b952
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
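+ // (x & (x - 1)) == 0 only when x is a power of two, so k0 and n0 must be powers of two in [2, 16] or equal to 3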
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+ if(gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+ {
+ reinterpret_output_as_3d = false;
+ }
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32));
+
+ TensorInfo tmp_info(*output);
+
+ if(reinterpret_output_as_3d)
+ {
+ // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(output->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through AccessWindowStatic
+ const int m = gemm_info.m();
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0,
+ ceil_to_multiple(input0->dimension(0), lhs_info.k0),
+ input0->dimension(1) + bottom_pad);
+ AccessWindowStatic input1_access(input1, 0, 0,
+ input1->dimension(0),
+ input1->dimension(1));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+
+ output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+ _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+ std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ lhs_info,
+ rhs_info,
+ gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if(_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as a 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if(_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as a 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
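+ // The tensors are passed as 2D slices, so their Z strides are forwarded explicitly for the kernel to offset into the third dimension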
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index 83af0c6..8fba342 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,6 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
- ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index b6816ac..8969124 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -168,7 +168,7 @@
} // namespace
CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
{
}
@@ -184,6 +184,7 @@
_output = output;
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
_k = gemm_info.k();
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// Check if we need to slide the matrix B
const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -206,6 +207,9 @@
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -308,7 +312,7 @@
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
}
\ No newline at end of file
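
The new _use_dummy_work_items flag drives both the -DDUMMY_WORK_ITEMS define and the extra argument to enqueue(). A hedged sketch of the intent, assuming dummy work items simply pad the global work size up to a multiple of the local work size so that out-of-range work items can exit early inside the kernel (the helper below is illustrative, not library code):

    #include <cstddef>

    // Round the global work size up so every LWS candidate divides it evenly.
    std::size_t pad_gws_to_lws(std::size_t gws, std::size_t lws)
    {
        return ((gws + lws - 1) / lws) * lws;
    }
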
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
new file mode 100644
index 0000000..2437265
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+
+ const int m = gemm_info.m();
+ const int n = gemm_info.n();
+ const int k = gemm_info.k();
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+ if(gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+ Window win{};
+ Window win_out{};
+ bool window_changed = false;
+
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+ {
+ reinterpret_output_as_3d = false;
+ }
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
+
+ TensorInfo tmp_info(*output);
+
+ if(reinterpret_output_as_3d)
+ {
+ // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(output->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = rhs_info.n0;
+ num_elems_processed_per_iteration_y = lhs_info.m0;
+
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+ const int m = gemm_info.m();
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0,
+ input0->dimension(0),
+ input0->dimension(1) + bottom_pad);
+ AccessWindowStatic input1_access(input1, 0, 0,
+ input1->dimension(0),
+ input1->dimension(1));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+
+ output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
+{
+}
+
+void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, lhs_info, rhs_info, gemm_info));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
+ _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+ build_opts.add_option_if(std::abs(1.0f - alpha) > 0.00001f, "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+
+ std::string kernel_name("gemm_mm_reshaped_only_rhs_");
+ kernel_name += rhs_info.transpose ? "t" : "nt";
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(gemm_info.k());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(lhs_info.m0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.n0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.k0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.h0);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(rhs_info.interleave);
+}
+
+Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, lhs_info, rhs_info, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ lhs_info,
+ rhs_info,
+ gemm_info,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_matrix_b = slice;
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if(_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as a 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if(_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as a 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
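
The bottom_pad expression in validate_and_configure_window() above rounds M up to the next multiple of lhs_info.m0, the number of rows processed per iteration. A minimal sketch with illustrative values (not library code):

    #include <cassert>

    constexpr int bottom_pad(int m, int rows_per_iteration)
    {
        return (rows_per_iteration - (m % rows_per_iteration)) % rows_per_iteration;
    }

    static_assert(bottom_pad(23, 4) == 1, "23 rows need 1 padding row to reach 24");
    static_assert(bottom_pad(24, 4) == 0, "an exact multiple needs no padding");
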
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
index bd523c8..3b45b07 100644
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,17 +29,17 @@
void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
{
- const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+ const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
// Set arguments
- CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined);
+ CLSeparableConvolution5x5HorKernel::configure(input, output, matrix.data(), border_undefined);
}
void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
{
- const uint32_t scale = 256;
- const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+ const uint32_t scale = 256;
+ const std::array<int16_t, 5> matrix = { 1, 4, 6, 4, 1 };
// Set arguments
- CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined);
+ CLSeparableConvolution5x5VertKernel::configure(input, output, matrix.data(), scale, border_undefined);
}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 6b729c8..c9c7bf3 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,7 +38,7 @@
BorderSize CLGaussianPyramidHorKernel::border_size() const
{
- return BorderSize(0, 2);
+ return BorderSize{ 0, 2 };
}
void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
@@ -130,7 +130,7 @@
BorderSize CLGaussianPyramidVertKernel::border_size() const
{
- return BorderSize(2, 0);
+ return BorderSize{ 2, 0 };
}
void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..e3f2a96
--- /dev/null
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+{
+ num_elems_processed_per_iteration = 4;
+ // The window needs to be based on the input as we copy the full height of the input
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, height_offset, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
+ : _input(nullptr), _output(nullptr), _height_offset(0), _num_elems_processed_per_iteration()
+{
+}
+
+Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), height_offset, output->clone().get(), num_elems_processed_per_iteration).first);
+ return Status{};
+}
+
+void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+
+ _input = input;
+ _output = output;
+ _height_offset = height_offset;
+
+ auto win_config = validate_and_configure_window(input->info(), height_offset, output->info(), _num_elems_processed_per_iteration);
+
+ // Add build options
+ CLBuildOptions build_opts;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ {
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ }
+ case 2:
+ {
+ build_opts.add_option("-DDATA_TYPE=short");
+ break;
+ }
+ case 4:
+ {
+ build_opts.add_option("-DDATA_TYPE=int");
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ break;
+ }
+ }
+
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+ build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_height", build_opts.options()));
+ // Configure kernel window
+
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window);
+}
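
The element-size switch in configure() above works because height concatenation is a pure copy: only the element width matters, so any OpenCL type of the matching size can stand in for the real data type. A standalone sketch of that mapping (illustrative, not library code):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    std::string cl_copy_type_for_element_size(std::size_t element_size)
    {
        switch(element_size)
        {
            case 1: return "uchar"; // U8, S8, QASYMM8
            case 2: return "short"; // U16, S16, F16
            case 4: return "int";   // U32, F32
            default: throw std::runtime_error("Unsupported element size");
        }
    }
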
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index e33dab0..cb2e294 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -117,7 +117,7 @@
{
case 0:
kernel_name = "x";
- idx = num_arguments_per_1D_tensor() * 3;
+ idx = num_arguments_per_2D_tensor() * 3;
break;
case 1:
kernel_name = "y";
@@ -169,17 +169,17 @@
case 0:
{
window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window in_slice = window.first_slice_window_1D();
- Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_2D();
+ Window sum_slice = window_sum.first_slice_window_2D();
do
{
unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _sum, sum_slice);
- add_1D_tensor_argument(idx, _output, in_slice);
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
+ add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice);
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
index ab53897..80caf94 100644
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,27 +35,38 @@
namespace arm_compute
{
CLMemsetKernel::CLMemsetKernel()
- : ICLKernel(), _tensor(nullptr)
+ : ICLKernel(), _tensor(nullptr), _full_window()
{
}
void CLMemsetKernel::configure(ICLTensor *tensor,
- const PixelValue &constant_value)
+ const PixelValue &constant_value,
+ Window *window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
+
_tensor = tensor;
- const DataType data_type = tensor->info()->data_type();
- const int vec_size_x = 16 / tensor->info()->element_size();
- const int output_width_x = tensor->info()->tensor_shape().x();
- const bool multi_access_x = (output_width_x / vec_size_x > 0);
+ const DataType data_type = tensor->info()->data_type();
+ const int vec_size_x = 16 / tensor->info()->element_size();
// Create and update the window (if needed)
- Window win = calculate_max_window(*tensor->info());
+ _full_window = calculate_max_window(*tensor->info());
+ Window win = _full_window;
+ if(window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+ win = *window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
+
if(multi_access_x)
{
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -64,14 +75,18 @@
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("memset", build_opts.options()));
}
-Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value)
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
{
ARM_COMPUTE_UNUSED(tensor);
ARM_COMPUTE_UNUSED(constant_value);
+ if(window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+ }
return Status{};
}
@@ -81,15 +96,15 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
// Collapse all the batches on the third dimension
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
- Window slice = collapsed.first_slice_window_2D();
+ Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _tensor, slice);
+ add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice);
}
- while(collapsed.slide_window_slice_2D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
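
The reworked configure() derives its vectorisation bookkeeping from the (possibly user-supplied) window rather than the tensor shape: the X dimension is padded up to the vector size and, when the width is not an exact multiple, LAST_ACCESSED_X anchors the final vector access inside the buffer. A sketch with illustrative names and values (not library code):

    #include <algorithm>

    struct MemsetVecInfo
    {
        bool multi_access_x;
        bool remainder_x;
        int  last_accessed_x;
        int  padded_end_x;
    };

    MemsetVecInfo compute_vec_info(int width_x, int element_size)
    {
        const int vec_size_x = 16 / element_size; // 16 bytes per vectorised access
        MemsetVecInfo info{};
        info.multi_access_x  = width_x >= vec_size_x;
        info.remainder_x     = (width_x % vec_size_x) > 0;
        info.last_accessed_x = std::max(width_x - vec_size_x, 0);
        info.padded_end_x    = ((width_x + vec_size_x - 1) / vec_size_x) * vec_size_x;
        return info;
    }

    // Example: 21 F32 elements -> vectors of 4, X end padded to 24,
    // last in-bounds vector starts at element 17.
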
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index fa7b678..92b5f8d 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -127,7 +127,7 @@
Iterator output(_output, window_output);
// Reset output
- execute_window_loop(window_output, [&](const Coordinates & id)
+ execute_window_loop(window_output, [&](const Coordinates &)
{
auto *ptr = reinterpret_cast<float *>(output.ptr());
ptr[0] = std::numeric_limits<float>::max();
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 286b94e..dda9b16 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,8 @@
#include <set>
#include <string>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
constexpr unsigned int num_elems_processed_per_iteration = 16;
@@ -274,5 +274,141 @@
{
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ return BorderSize{ 0, border, 0, 0 };
}
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration_complex = 1;
+
+Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
+ auto_init_if_empty(*output, out_info);
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLComplexPixelWiseMultiplicationKernel::CLComplexPixelWiseMultiplicationKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLComplexPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pixelwise_mul_complex"));
+
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLComplexPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ bool can_collapse = true;
+ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+
+BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
+{
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
+ return BorderSize{ 0, border, 0, 0 };
+}
+} // namespace arm_compute
\ No newline at end of file
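
The new CLComplexPixelWiseMultiplicationKernel operates on 2-channel F32 tensors, where each element packs a (real, imaginary) pair. A minimal sketch of the per-element arithmetic the pixelwise_mul_complex kernel is expected to perform (illustrative C++, not the OpenCL source):

    #include <array>

    using Complex = std::array<float, 2>; // { real, imag }

    Complex complex_mul(const Complex &x, const Complex &y)
    {
        return Complex{ x[0] * y[0] - x[1] * y[1],   // real part: ac - bd
                        x[0] * y[1] + x[1] * y[0] }; // imag part: ad + bc
    }
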
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 7081688..7ccbda9 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -78,7 +78,6 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
@@ -201,6 +200,17 @@
const int pool_pad_top = pad_stride_info.pad_top();
const int pool_pad_left = pad_stride_info.pad_left();
+ // Set build options
+ CLBuildOptions build_opts;
+
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ }
+
// Check output dimensions
auto_init(input->info(), output->info(), pool_info);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
@@ -212,8 +222,6 @@
const DataType data_type = input->info()->data_type();
- // Set build options
- CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
@@ -222,6 +230,7 @@
build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+
build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
// Create kernel
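
The OFFSET_*/SCALE_* options added to the pooling build let the kernel re-quantize when the input and output QASYMM8 quantization info differ, which is why the matching-quantization check was dropped from validate_arguments() above. A sketch of the per-value re-quantization, assuming the usual asymmetric scheme (illustrative, not the OpenCL source):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize_qasymm8(uint8_t q_in, float scale_in, int offset_in,
                               float scale_out, int offset_out)
    {
        const float real = scale_in * (static_cast<int>(q_in) - offset_in);    // dequantize
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));            // clamp to QASYMM8
    }
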
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 9028b0f..374b22e 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -36,73 +37,76 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ if((output != nullptr) && (output->total_size() != 0))
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::QASYMM8);
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
- // Update window and padding
- bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
+ return std::make_tuple(Status{}, win);
}
} // namespace
CLQuantizationLayerKernel::CLQuantizationLayerKernel()
- : _input(nullptr), _output(nullptr), _min_max(nullptr)
+ : _input(nullptr), _output(nullptr)
{
}
-void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
+void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
- _input = input;
- _output = output;
- _min_max = min_max;
+ _input = input;
+ _output = output;
+
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*input->info());
+ if(multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- ICLKernel::configure_internal(std::get<1>(win_config));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer", build_opts.options()));
}
-Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -117,13 +121,9 @@
do
{
- Window slice_min_max = slice.shift_dimensions(2);
- slice_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- add_1D_tensor_argument(idx, _min_max, slice_min_max);
enqueue(queue, *this, slice);
}
while(window_collapsed.slide_window_slice_3D(slice));
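
With the min/max tensor gone, the quantization kernel now bakes the output quantization info straight into the build options. A sketch of the per-element mapping those SCALE/OFFSET defines describe (illustrative, not the OpenCL source):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t quantize_qasymm8(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to QASYMM8
    }
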
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 1f4cff3..db4850f 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -47,7 +47,14 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ if(input->num_channels() == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
@@ -55,7 +62,6 @@
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not supported operation for QASYMM8");
@@ -78,7 +84,7 @@
output_shape.set(axis, 1);
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
- auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -161,6 +167,7 @@
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
+ build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
switch(op)
{
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
index 46aa074..be2a44b 100644
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -138,7 +138,7 @@
BorderSize CLSobel5x5VertKernel::border_size() const
{
- return BorderSize(2, 0);
+ return BorderSize{ 2, 0 };
}
void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
index 0c94e88..a4a20c1 100644
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -141,7 +141,7 @@
BorderSize CLSobel7x7VertKernel::border_size() const
{
- return BorderSize(3, 0);
+ return BorderSize{ 3, 0 };
}
void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 403256b..e2d9881 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,11 +62,12 @@
const double beta_multiplier = std::min(
1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
- (1ll << 31) - 1.0);
- int input_beta_multiplier, input_beta_left_shift;
+ (1LL << 31) - 1.0);
+ int input_beta_multiplier;
+ int input_beta_left_shift;
quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
- const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
+ const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
const int diff_min = -1.f * std::floor(max_input_rescaled);
CLBuildOptions build_opts;
@@ -337,7 +338,7 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
// Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+ const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.F / 256, 0);
const bool is_quantized_asymmetric = (input->info()->data_type() == DataType::S32);
const DataType output_data_type = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index c40f3c9..5a6b958 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,7 +115,9 @@
const TensorShape &input_shape = input->info()->tensor_shape();
- Coordinates starts_abs, ends_abs, final_strides;
+ Coordinates starts_abs;
+ Coordinates ends_abs;
+ Coordinates final_strides;
std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
input_shape,
starts, ends, strides,
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index d58cef5..5f266c5 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -35,6 +35,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/tensor_info.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
@@ -111,14 +112,16 @@
build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
- if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+ // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+ const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info());
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
{
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
}
// Create kernel
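On the OFFSET_*/SCALE_* defines being reshuffled above: the check was widened so the defines are emitted whenever any input's quantization info differs from the output's, because in that case every value has to be mapped into the output's quantized space rather than copied through. A scalar sketch of that re-quantization, assuming the usual asymmetric affine scheme (the helper name is illustrative):

    // Map one QASYMM8 value from (scale_in, offset_in) into (scale_out, offset_out).
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t requantize(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float real = (static_cast<int>(q_in) - offset_in) * scale_in;              // de-quantize
        const int   q    = static_cast<int>(std::lround(real / scale_out)) + offset_out; // quantize
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));                      // saturate to 8 bits
    }

When all tensors share the same quantization info the defines are omitted and the kernel can copy values through unchanged.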
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 9cbb713..54edaaf 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -35,6 +35,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/helpers/tensor_info.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
@@ -133,18 +134,20 @@
build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
- if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info())
+ // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+ const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info(), input3->info(), input4->info());
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
{
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset));
- build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale));
- build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale));
build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().scale));
build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().offset));
build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale));
}
// Create kernel
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 84b5ea2..bf3a00d 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -139,7 +139,7 @@
} // namespace
CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
+ : _input(nullptr), _bias(nullptr), _output(nullptr), _is_nhwc(false)
{
}
@@ -152,9 +152,10 @@
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info));
- _input = input;
- _bias = bias;
- _output = output;
+ _input = input;
+ _bias = bias;
+ _output = output;
+ _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
// Compute num_tiles_x
const Size2D input_dimensions = winograd_info.input_dimensions;
@@ -253,7 +254,7 @@
add_1D_tensor_argument(idx1, _bias, slice_biases);
}
- if(_output->info()->data_layout() == DataLayout::NHWC)
+ if(_is_nhwc)
{
unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
_kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index d77d9c1..d29c0f7 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -73,14 +73,15 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
// Initialize _scaled_output buffer
- const int width_scaled = _output->info()->dimension(0);
- const int height_scaled = _output->info()->dimension(1);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
- const int start_x = _info.pad().first;
- const int start_y = _inner_border.second + _info.pad().second;
- const int end_y = height_scaled - _info.pad().second;
- const int end_x = width_scaled - _inner_border.first - _info.pad().first;
+ const int width_scaled = _output->info()->dimension(0);
+ const int height_scaled = _output->info()->dimension(1);
+ const int stride_x = _info.stride().first;
+ const int stride_y = _info.stride().second;
+ const int start_x = _info.pad().first;
+ const int start_y = _inner_border.second + _info.pad().second;
+ const int end_y = height_scaled - _info.pad().second;
+ const int end_x = width_scaled - _inner_border.first - _info.pad().first;
+ const size_t element_size = _input->info()->element_size();
std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
@@ -93,9 +94,9 @@
Iterator in(_input, window);
Iterator out(_output, window_out);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
- *(reinterpret_cast<float *>(out.ptr())) = *(reinterpret_cast<const float *>(in.ptr()));
+ memcpy(out.ptr(), in.ptr(), element_size);
},
in, out);
}
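The switch from a float-typed copy to memcpy of element_size bytes makes the upsample copy data-type agnostic. Conceptually the kernel zero-fills the output and scatters each input element to a strided position; a simplified sketch of that pattern on dense buffers (layout and names here are illustrative, not the kernel's actual window mechanics):

    // Zero-fill the output, then scatter input elements on a (stride_x, stride_y) grid,
    // copying element_size bytes per element so any data type works.
    #include <cstdint>
    #include <cstring>

    void upsample_scatter(const uint8_t *in, uint8_t *out, int in_w, int in_h,
                          int out_w, int out_h, int stride_x, int stride_y, size_t element_size)
    {
        std::memset(out, 0, static_cast<size_t>(out_w) * out_h * element_size);
        for(int y = 0; y < in_h; ++y)
        {
            for(int x = 0; x < in_w; ++x)
            {
                const uint8_t *src = in  + (static_cast<size_t>(y) * in_w + x) * element_size;
                uint8_t       *dst = out + (static_cast<size_t>(y) * stride_y * out_w + static_cast<size_t>(x) * stride_x) * element_size;
                std::memcpy(dst, src, element_size);
            }
        }
    }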
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index e7b4365..45cce66 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Error.h"
+#include <array>
#include <cstdarg>
#include <cstdio>
#include <iostream>
@@ -32,11 +33,11 @@
Status arm_compute::create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args)
{
- char out[512];
- int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
- vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ std::array<char, 512> out{ 0 };
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", function, file, line);
+ vsnprintf(out.data() + offset, out.size() - offset, msg, args);
- return Status(error_code, std::string(out));
+ return Status(error_code, std::string(out.data()));
}
Status arm_compute::create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...)
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 25ac02e..0af8c7d 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -387,7 +387,7 @@
return kernel;
}
-const std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
+std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
{
enum class ParserStage
{
@@ -399,7 +399,7 @@
// Define a GLES compute shader parser function
std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
- cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string
+ cs_parser = [&](const std::string & src, ParserStage stage, int) -> std::string
{
std::string dst;
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 69ac50b..49b3954 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,7 +53,7 @@
Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+ float tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
}
@@ -66,7 +66,7 @@
Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+ uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
}
#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 36d1b29..6f70efe 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,28 +38,18 @@
using namespace arm_compute;
GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+ : _input(nullptr), _output(nullptr), _depth_offset(0)
{
}
-
-BorderSize GCDepthConcatenateLayerKernel::border_size() const
-{
- return BorderSize(_top_bottom, _left_right);
-}
-
void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
- // The gaps between the two lowest dimensions of input and output need to be divisible by 2
- // Otherwise it is not clear how the padding should be added onto the input tensor
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) != output->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) != output->info()->dimension(Window::DimY));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
_input = input;
_output = output;
@@ -73,35 +63,20 @@
build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- // Configure kernel window
- _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
- _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
-
- build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right));
- build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom));
-
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
unsigned int num_elems_processed_per_iteration = 1;
- unsigned int num_elems_read_per_iteration = 1;
- if(input->info()->data_type() == DataType::F32)
- {
- num_elems_processed_per_iteration = 1;
- num_elems_read_per_iteration = 1;
- }
- else if(input->info()->data_type() == DataType::F16)
+ if(input->info()->data_type() == DataType::F16)
{
num_elems_processed_per_iteration = 4;
- num_elems_read_per_iteration = 4;
}
- const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
- AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
@@ -118,11 +93,9 @@
_output->set_needs_shifting(true);
- Window slice = window.first_slice_window_3D();
Window slice_in = window.first_slice_window_3D();
Window slice_out = window.first_slice_window_3D();
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
do
@@ -133,7 +106,7 @@
_kernel.update_shader_params();
- enqueue(*this, slice);
+ enqueue(*this, slice_in);
}
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+ while(window.slide_window_slice_3D(slice_in));
}
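With the OFFSET_X / OFFSET_Y padding logic removed, depth concatenation on GLES reduces to copying each input's planes into the output at a running channel offset, which is why the kernel now only requires the X and Y dimensions to match exactly. A host-side sketch of the same operation on dense NCHW buffers (layout and names are illustrative):

    // Concatenate inputs along the channel axis of a dense, float NCHW buffer.
    #include <cstring>
    #include <vector>

    struct Input3D
    {
        const float *data;
        int          channels;
    };

    void concat_depth(const std::vector<Input3D> &inputs, float *output, int width, int height)
    {
        const size_t plane_elems  = static_cast<size_t>(width) * height;
        size_t       depth_offset = 0;
        for(const auto &in : inputs)
        {
            // Append this input's planes right after the ones already written
            std::memcpy(output + depth_offset * plane_elems, in.data,
                        static_cast<size_t>(in.channels) * plane_elems * sizeof(float));
            depth_offset += in.channels;
        }
    }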
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index f225ebd..50171a1 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -333,7 +333,10 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+ unsigned int pool_pad_x;
+ unsigned int pool_pad_y;
+ unsigned int pool_stride_x;
+ unsigned int pool_stride_y;
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index 4f99455..bfee12c 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,24 +68,24 @@
{
ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
- return Size2D(_block_size.width / _cell_size.width,
- _block_size.height / _cell_size.height);
+ return Size2D{ _block_size.width / _cell_size.width,
+ _block_size.height / _cell_size.height };
}
Size2D HOGInfo::num_cells_per_block_stride() const
{
ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
- return Size2D(_block_stride.width / _cell_size.width,
- _block_stride.height / _cell_size.height);
+ return Size2D{ _block_stride.width / _cell_size.width,
+ _block_stride.height / _cell_size.height };
}
Size2D HOGInfo::num_block_positions_per_image(const Size2D &image_size) const
{
ARM_COMPUTE_ERROR_ON(_block_stride.width == 0 || _block_stride.height == 0);
- return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
- ((image_size.height - _block_size.height) / _block_stride.height) + 1);
+ return Size2D{ ((image_size.width - _block_size.width) / _block_stride.width) + 1,
+ ((image_size.height - _block_size.height) / _block_stride.height) + 1 };
}
const Size2D &HOGInfo::cell_size() const
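The brace-initialisation change above does not alter the HOG geometry, but the three formulas are easy to sanity-check with concrete numbers. A worked example, assuming a common 16x16 block, 8x8 cell, 8x8 block stride and a 64x128 detection window (illustrative values only):

    // cells per block, cells per block stride, and block positions per image.
    #include <cstdio>

    int main()
    {
        const int block = 16, cell = 8, stride = 8, img_w = 64, img_h = 128;
        std::printf("cells per block:        %d x %d\n", block / cell, block / cell);   // 2 x 2
        std::printf("cells per block stride: %d x %d\n", stride / cell, stride / cell); // 1 x 1
        std::printf("block positions:        %d x %d\n",
                    (img_w - block) / stride + 1,                                       // 7
                    (img_h - block) / stride + 1);                                      // 15
        return 0;
    }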
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index e6c80e8..7cf04b5 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -64,7 +64,7 @@
const size_t line_size = src_info->element_size() * src_info->dimension(0);
- execute_window_loop(win_src, [&](const Coordinates & id)
+ execute_window_loop(win_src, [&](const Coordinates &)
{
memcpy(dst_it.ptr(), src_it.ptr(), line_size);
},
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index e0c2891..62285e0 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@
Iterator input2(in2, window);
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t input1_val = vld1q_u8(input1.ptr());
const uint8x16_t input2_val = vld1q_u8(input2.ptr());
@@ -78,7 +78,7 @@
Iterator input2(in2, window);
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -93,7 +93,7 @@
Iterator input2(in2, window);
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t input1_val = vld1q_u8(input1.ptr());
const int16x8x2_t input2_val =
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index dae0800..d601adc 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -126,7 +126,7 @@
const float16x8_t scale_val = vdupq_n_f16(1.f - _alpha);
const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
},
@@ -271,7 +271,7 @@
Iterator input(_input, window);
Iterator accum(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
acc_v16_u8(input.ptr(), accum.ptr());
},
@@ -314,7 +314,7 @@
const float32x4_t scale_val = vdupq_n_f32(1.f - _alpha);
const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
},
@@ -353,7 +353,7 @@
Iterator input(_input, window);
Iterator accum(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
},
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index b67396c..8de8db9 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -195,7 +195,7 @@
const auto a = static_cast<T>(_act_info.a());
const auto b = static_cast<T>(_act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
const auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -319,6 +319,7 @@
const qasymm8_t b = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset);
const qasymm8_t const_0 = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset);
const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
// Initialise scale/offset for re-quantization
float s = qi_in.scale / qi_out.scale;
@@ -326,7 +327,7 @@
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(win_collapsed, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
const auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -361,41 +362,20 @@
}
else if(act == ActivationFunction::LOGISTIC)
{
- const auto scale_in = vdupq_n_f32(qi_in.scale);
- const auto off_in = vdupq_n_f32(qi_in.offset);
- const auto scale_out = vdupq_n_f32(qi_out.scale);
- const auto off_out = vdupq_n_f32(qi_out.offset);
- const auto vconst_1 = vdupq_n_f32(1.f);
-
- const auto vin_low = wrapper::vgetlow(vin);
- const auto vin_high = wrapper::vgethigh(vin);
- uint16x8_t vin_low_u16x8 = wrapper::vmovl(vin_low);
- uint16x8_t vin_high_u16x8 = wrapper::vmovl(vin_high);
- // Convert uint16 vectors to uint32 vectors
- uint32x4_t A_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_low_u16x8));
- uint32x4_t B_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_low_u16x8));
- uint32x4_t C_u32x4 = wrapper::vmovl(wrapper::vgetlow(vin_high_u16x8));
- uint32x4_t D_u32x4 = wrapper::vmovl(wrapper::vgethigh(vin_high_u16x8));
- // Convert uint32 vectors to float32 vectors
- float32x4_t A_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(A_u32x4), off_in), scale_in);
- float32x4_t B_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(B_u32x4), off_in), scale_in);
- float32x4_t C_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(C_u32x4), off_in), scale_in);
- float32x4_t D_f32x4 = wrapper::vmul(wrapper::vsub(vcvtq_f32_u32(D_u32x4), off_in), scale_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
// Perform activation
- A_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(A_f32x4))));
- B_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(B_f32x4))));
- C_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(C_f32x4))));
- D_f32x4 = wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(D_f32x4))));
- // Convert float32 vectors to uint32 vectors
- A_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(A_f32x4, scale_out), off_out));
- B_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(B_f32x4, scale_out), off_out));
- C_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(C_f32x4, scale_out), off_out));
- D_u32x4 = vcvtq_u32_f32(wrapper::vadd(wrapper::vdiv(D_f32x4, scale_out), off_out));
- // Convert uint32 vectors to uint16 vectors (with saturation)
- vin_low_u16x8 = wrapper::vcombine(wrapper::vqmovn(A_u32x4), wrapper::vqmovn(B_u32x4));
- vin_high_u16x8 = wrapper::vcombine(wrapper::vqmovn(C_u32x4), wrapper::vqmovn(D_u32x4));
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = wrapper::vcombine(wrapper::vqmovn(vin_low_u16x8), wrapper::vqmovn(vin_high_u16x8));
+ const float32x4x4_t tmp_dep =
+ {
+ {
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+ }
+ };
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
}
else
{
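The rewrite above replaces the hand-unrolled widen/convert/narrow chain with the vdequantize / vquantize helpers; per lane the arithmetic is unchanged. A scalar sketch of what each QASYMM8 lane goes through on the LOGISTIC path, assuming the usual asymmetric quantization (the helper name is illustrative):

    // Logistic activation on one QASYMM8 value: de-quantize, apply 1/(1+e^-x), re-quantize.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t qasymm8_logistic(uint8_t q, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float x = (static_cast<int>(q) - offset_in) * scale_in;
        const float y = 1.0f / (1.0f + std::exp(-x));
        const int   r = static_cast<int>(std::lround(y / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(255, std::max(0, r)));
    }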
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index ffa578f..ca79a0a 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -84,7 +84,7 @@
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
const auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -120,7 +120,7 @@
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
@@ -165,8 +165,8 @@
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
- const float output_scale = out->info()->quantization_info().scale;
- const int output_offset = out->info()->quantization_info().offset;
+ const float output_scale = out->info()->quantization_info().scale;
+ const int output_offset = out->info()->quantization_info().offset;
const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
@@ -192,7 +192,7 @@
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
@@ -234,7 +234,7 @@
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
+#else //__aarch64__
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
@@ -252,7 +252,7 @@
for(; x < window_end_x; ++x)
{
const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
+ *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
}
},
broadcast_input, non_broadcast_input, output);
@@ -270,7 +270,7 @@
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
@@ -311,7 +311,7 @@
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
+#else //__aarch64__
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
@@ -330,7 +330,7 @@
{
const float afs = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
- *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
+ *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs), RoundingPolicy::TO_NEAREST_UP);
}
},
input1, input2, output);
@@ -357,7 +357,7 @@
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
@@ -427,7 +427,7 @@
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index ff8fb84..45e1562 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -54,7 +55,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t ta1 = vld1q_u8(input1.ptr());
const uint8x16_t ta2 = vld1q_u8(input2.ptr());
@@ -70,7 +71,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t ta1 = vld1q_u8(input1.ptr());
const uint8x16_t ta2 = vld1q_u8(input2.ptr());
@@ -80,13 +81,41 @@
input1, input2, output);
}
+void sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), in1->info()->quantization_info());
+ const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), in2->info()->quantization_info());
+
+ const float32x4x4_t ta3 =
+ {
+ {
+ vsubq_f32(ta1.val[0], ta2.val[0]),
+ vsubq_f32(ta1.val[1], ta2.val[1]),
+ vsubq_f32(ta1.val[2], ta2.val[2]),
+ vsubq_f32(ta1.val[3], ta2.val[3]),
+ }
+ };
+
+ const uint8x16_t result = vquantize(ta3, out->info()->quantization_info());
+
+ vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
+ },
+ input1, input2, output);
+}
+
void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -110,7 +139,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -150,7 +179,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
@@ -173,7 +202,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
@@ -198,7 +227,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
@@ -219,7 +248,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
@@ -240,7 +269,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -261,7 +290,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
@@ -282,7 +311,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t av_0 = vld1q_u8(input1.ptr());
const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
@@ -304,7 +333,7 @@
Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t av_0 = vld1q_u8(input1.ptr());
const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
@@ -324,18 +353,34 @@
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
+ && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
+ && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
+ && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
+ && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
+ && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
+ "You called subtract with the wrong image formats");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP,
+ "Convert policy cannot be WRAP if datatype is QASYMM8");
+
// Validate in case of configured output
if(output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
!(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+ && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
@@ -413,6 +458,7 @@
{ "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
{ "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
{ "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
+ { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
{ "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
{ "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
{ "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
@@ -469,5 +515,5 @@
{
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ return BorderSize{ 0, border, 0, 0 };
}
\ No newline at end of file
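The new QASYMM8 path added above follows the same shape: both inputs are de-quantized with vdequantize, subtracted in float, and re-quantized with vquantize, and the validation accepts only ConvertPolicy::SATURATE for QASYMM8, presumably because wrap-around has no meaning once the values pass through real arithmetic. A scalar reference for one element pair (illustrative helper, not the NEON code):

    // Saturating QASYMM8 subtraction of a single element pair.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t qasymm8_sub_saturate(uint8_t a, uint8_t b,
                                        float scale1, int offset1,
                                        float scale2, int offset2,
                                        float scale_out, int offset_out)
    {
        const float af = (static_cast<int>(a) - offset1) * scale1;  // de-quantize lhs
        const float bf = (static_cast<int>(b) - offset2) * scale2;  // de-quantize rhs
        const int   r  = static_cast<int>(std::lround((af - bf) / scale_out)) + offset_out;
        return static_cast<uint8_t>(std::min(255, std::max(0, r))); // saturate to [0, 255]
    }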
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
new file mode 100644
index 0000000..6211abc
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_y <= 0);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
+ : _input(nullptr), _block_shape(nullptr), _output(nullptr), _block_shape_x(), _block_shape_y()
+{
+}
+
+void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
+
+ _input = input;
+ _block_shape = block_shape;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICPPKernel::configure(win);
+}
+
+void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+
+ _input = input;
+ _output = output;
+ _block_shape_x = block_shape_x;
+ _block_shape_y = block_shape_y;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICPPKernel::configure(win);
+}
+
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
+ return Status{};
+}
+
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ return Status{};
+}
+
+void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ if(_block_shape != nullptr)
+ {
+ // Retrieve the block shapes dynamically
+ _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
+ _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
+ }
+
+ const int batch_size = _input->info()->dimension(3);
+ const int r = (batch_size / (_block_shape_x * _block_shape_y));
+ const int element_size = _input->info()->element_size();
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_4D();
+
+ // The slice_out slice does not move
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ int batch_id = 0;
+ // Main loop for NCHW and NHWC
+ if(_input->info()->data_layout() == DataLayout::NCHW)
+ {
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in, [&](const Coordinates & id)
+ {
+
+ const int x = id.x();
+ const int y = id.y();
+ const int z = id.z();
+
+ const int w = batch_id % r;
+ const int out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
+ const int out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
+ Coordinates output_coords{ out_x, out_y, z, w };
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_in));
+ }
+ else
+ {
+ do
+ {
+ Iterator in(_input, slice_in);
+ execute_window_loop(slice_in, [&](const Coordinates & id)
+ {
+
+ const int z = id.x();
+ const int x = id.y();
+ const int y = id.z();
+
+ const int w = batch_id % r;
+ const int out_x = x * _block_shape_x + (batch_id / r) % _block_shape_x;
+ const int out_y = y * _block_shape_y + (batch_id / r) / _block_shape_x;
+ Coordinates output_coords{ z, out_x, out_y, w };
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_in));
+ }
+}
+} // namespace arm_compute
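The index mapping in the new kernel is the core of batch-to-space: r = batch_size / (block_shape_x * block_shape_y) output batches remain, and the (batch_id / r) quotient selects which position inside the spatial block each input batch lands on. A quick check of the NCHW mapping with small, illustrative numbers (4 input batches, 2x2 block shape):

    // Print where the element at (x, y) = (0, 0) of each input batch ends up.
    #include <cstdio>

    int main()
    {
        const int block_x = 2, block_y = 2, batch_size = 4;
        const int r = batch_size / (block_x * block_y); // = 1 output batch
        for(int batch_id = 0; batch_id < batch_size; ++batch_id)
        {
            const int w     = batch_id % r;
            const int out_x = 0 * block_x + (batch_id / r) % block_x;
            const int out_y = 0 * block_y + (batch_id / r) / block_x;
            std::printf("input batch %d -> output batch %d, pixel (%d, %d)\n", batch_id, w, out_x, out_y);
        }
        return 0;
    }

Batches 0 to 3 fill the four positions (0,0), (1,0), (0,1) and (1,1) of the first 2x2 output block, the inverse of space-to-batch.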
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index ed83286..71312a9 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -106,7 +106,7 @@
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
},
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 08d7fe2..5791dcc 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,7 +89,7 @@
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
bitwise_not_U8_U8(input.ptr(), output.ptr());
},
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 1b17cc2..8aed9bb 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,7 +102,7 @@
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
},
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index 9451e8a..e2dcb95 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,7 +98,7 @@
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
},
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 0c97005..7a53f93 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@
const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -160,7 +160,7 @@
const float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index fa51a7b..8d822bd 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -950,7 +950,7 @@
Iterator magnitude(_magnitude, window);
Iterator phase(_phase, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
(*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
},
@@ -1034,7 +1034,7 @@
const size_t input1_stride = _magnitude->info()->strides_in_bytes()[1];
const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
(*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
},
@@ -1113,7 +1113,7 @@
const size_t input_stride = _input->info()->strides_in_bytes()[1];
const size_t output_stride = _output->info()->strides_in_bytes()[1];
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
},
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 28fb4bd..539154d 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -286,7 +286,7 @@
Iterator p2(_planes[2], win);
Iterator out(_output, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
@@ -315,7 +315,7 @@
Iterator p3(_planes[3], win);
Iterator out(_output, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
@@ -353,7 +353,7 @@
constexpr auto shift = is_uyvy ? 1 : 0;
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
@@ -409,7 +409,7 @@
// Increase step size after iterator is created to calculate stride correctly for multi channel format
out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
- execute_window_loop(out_win, [&](const Coordinates & id)
+ execute_window_loop(out_win, [&](const Coordinates &)
{
const uint8x8x2_t pixels =
{
@@ -444,7 +444,7 @@
Iterator in(_planes[plane_id], tmp_win);
Iterator out(_output_multi->plane(plane_id), tmp_win);
- execute_window_loop(tmp_win, [&](const Coordinates & id)
+ execute_window_loop(tmp_win, [&](const Coordinates &)
{
const uint8x8_t pixels = vld1_u8(in.ptr());
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index 98b2f28..61e1304 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -192,7 +192,7 @@
Iterator in(_input, win);
Iterator out(_output, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto in_ptr = static_cast<uint8_t *>(in.ptr());
const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -207,7 +207,7 @@
Iterator in(_input, win);
Iterator out(_output, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto in_ptr = static_cast<uint8_t *>(in.ptr());
const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -222,7 +222,7 @@
Iterator in(_input, win);
Iterator out(_output, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto in_ptr = static_cast<uint8_t *>(in.ptr());
const auto out_ptr = static_cast<uint8_t *>(out.ptr());
@@ -242,7 +242,7 @@
Iterator in(_input, win);
Iterator out(_output, win_out);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto in_ptr = static_cast<uint8_t *>(in.ptr());
const auto out_ptr = static_cast<uint8_t *>(out.ptr());
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 0a10546..b154340 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -317,7 +317,7 @@
template <unsigned int matrix_size>
BorderSize NEConvolutionKernel<matrix_size>::border_size() const
{
- return BorderSize(matrix_size / 2);
+ return BorderSize{ matrix_size / 2 };
}
template <unsigned int matrix_size>
@@ -388,7 +388,7 @@
const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4_t out = vdupq_n_s32(0);
int32x4_t out2 = vdupq_n_s32(0);
@@ -437,7 +437,7 @@
const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4_t out = vdupq_n_s32(0);
int32x4_t out2 = vdupq_n_s32(0);
@@ -496,7 +496,7 @@
const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4_t out = vdupq_n_s32(0);
int32x4_t out2 = vdupq_n_s32(0);
@@ -565,7 +565,7 @@
const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4_t out = vdupq_n_s32(0);
int32x4_t out2 = vdupq_n_s32(0);
@@ -728,7 +728,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -761,7 +761,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -794,7 +794,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -840,7 +840,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -875,7 +875,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -910,7 +910,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -962,7 +962,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -999,7 +999,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -1036,7 +1036,7 @@
Iterator input(_input, win_in);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -1096,7 +1096,7 @@
template <unsigned int matrix_size>
BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
{
- return BorderSize(matrix_size / 2, 0);
+ return BorderSize{ matrix_size / 2, 0 };
}
template <unsigned int matrix_size>
@@ -1209,7 +1209,7 @@
input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
uint16x8_t out0 = vdupq_n_u16(0);
uint16x8_t out1 = vdupq_n_u16(0);
@@ -1275,7 +1275,7 @@
input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int16x8_t out0 = vdupq_n_s16(0);
int16x8_t out1 = vdupq_n_s16(0);
@@ -1343,7 +1343,7 @@
const int32x4_t zero = vdupq_n_s32(0);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4x2_t out0 =
{
@@ -1576,7 +1576,7 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int32x4_t out = vdupq_n_s32(0);
int32x4_t out2 = vdupq_n_s32(0);
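border_size() above is now brace-initialized from half the matrix size; a worked sketch of the values this yields, assuming the usual templated kernel sizes:

    // matrix_size / 2 with integer division: a 3x3 kernel needs a 1-pixel border,
    // 5x5 -> 2, 7x7 -> 3, 9x9 -> 4 (assumed instantiations of the matrix_size template).
    BorderSize border_3x3{ 3u / 2 }; // BorderSize{ 1 }
    BorderSize border_5x5{ 5u / 2 }; // BorderSize{ 2 }
    BorderSize border_7x7{ 7u / 2 }; // BorderSize{ 3 }
    BorderSize border_9x9{ 9u / 2 }; // BorderSize{ 4 }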
diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp
index 20496ad..4722c05 100644
--- a/src/core/NEON/kernels/NECopyKernel.cpp
+++ b/src/core/NEON/kernels/NECopyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,7 +50,6 @@
Status NECopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
return Status{};
}
@@ -71,7 +70,7 @@
Iterator input_it(_input, out_slice);
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates & id)
+ execute_window_loop(out_slice, [&](const Coordinates &)
{
memcpy(output_it.ptr(), input_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size());
},
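The loop body above copies one innermost row per iteration; a worked sketch of the byte count, assuming a hypothetical F32 output whose innermost dimension holds 64 elements:

    const size_t row_elements = 64;            // output->info()->dimension(0) (assumed)
    const size_t element_size = sizeof(float); // output->info()->element_size() for F32
    const size_t bytes_copied = row_elements * element_size; // 256 bytes per memcpy call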
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
new file mode 100644
index 0000000..f16eb3e
--- /dev/null
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECropKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+inline float32x4_t load_as_f32(T *ptr)
+{
+ ARM_COMPUTE_UNUSED(ptr);
+ ARM_COMPUTE_ERROR("Type not supported.");
+}
+
+template <>
+inline float32x4_t load_as_f32(float *ptr)
+{
+ return wrapper::vloadq(ptr);
+}
+
+template <>
+inline float32x4_t load_as_f32(int32_t *ptr)
+{
+ return vcvtq_f32_s32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint32_t *ptr)
+{
+ return vcvtq_f32_u32(wrapper::vloadq(ptr));
+}
+
+template <>
+inline float32x4_t load_as_f32(int16_t *ptr)
+{
+ return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr)));
+}
+
+template <>
+inline float32x4_t load_as_f32(uint16_t *ptr)
+{
+ return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float32x4_t load_as_f32(float16_t *ptr)
+{
+ return vcvt_f32_f16(wrapper::vload(ptr));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T, bool input_has_single_channel, bool is_width_flipped>
+inline void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+{
+ // Reverse elements if width flipped.
+ if(is_width_flipped)
+ {
+ // Collapse first dimension if possible.
+ if(input_has_single_channel)
+ {
+ int32_t x = output_width_start;
+ Coordinates negative_offset(input_offset);
+ negative_offset.set(1, negative_offset[1] - window_step_x + 1);
+ for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
+
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+
+ wrapper::vstore(output_ptr + x, in);
+ }
+ input_offset[1] = negative_offset[1] + window_step_x - 1;
+ for(; x < output_width_limit; ++x, --input_offset[1])
+ {
+ *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ else
+ {
+ for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+ {
+ input_offset.set(0, 0);
+ int32_t c = 0;
+ for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
+ }
+ for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+ {
+ *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+ }
+ else
+ {
+ // Use memcpy if the elements don't need converting to float.
+ if(std::is_same<T, float>::value)
+ {
+ memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
+ reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
+ (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+ }
+ else
+ {
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+ {
+ auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ wrapper::vstore(output_start_ptr + x, in);
+ }
+ for(; x < limit; ++x, ++input_offset[0])
+ {
+ *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ }
+ }
+ }
+}
+
+inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+{
+ auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for(; x <= limit - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_start_ptr + x, in);
+ }
+ for(; x < limit; ++x)
+ {
+ *(output_start_ptr + x) = extrapolation_value;
+ }
+}
+
+template <bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after>
+inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
+ const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function)
+{
+ // Output is always float.
+ const int window_step_x = 16 / sizeof(float);
+ auto *output_ptr = reinterpret_cast<float *>(output->buffer());
+ // Output window:
+ // --------------------------------
+ // | Out of bounds |
+ // | rows before |
+ // |------------------------------|
+ // | Out of | In | Out of |
+ // | bounds | bounds | bounds |
+ // | cols | elements | cols |
+ // | before | copied | after |
+ // | | from input | |
+ // --------------------------------
+ // | Out of bounds |
+ // | rows after |
+ // |------------------------------|
+ // Fill all output rows that have no elements within the input bounds with the extrapolation value.
+ // First for the rows before the in bounds rows.
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+ output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
+ // Iterate through each row that has any elements within the input bounds.
+ for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+ ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+ {
+ // Fill all elements in the row that are out of bounds with the extrapolation value.
+ // First for the elements before the in bounds elements.
+ if(has_cols_out_of_bounds_before)
+ {
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
+ }
+ // Copy all elements within the input bounds from the input tensor.
+ if(has_cols_in_bounds)
+ {
+ (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1]);
+ }
+ // Fill all elements after the in bounds elements with the extrapolation value.
+ if(has_cols_out_of_bounds_after)
+ {
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+ }
+ output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
+ }
+ // Fill all rows after the in bounds elements with the extrapolation value.
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+}
+} // namespace
+
+NECropKernel::NECropKernel()
+ : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
+ _in_bounds_crop_functions(), _in_bounds_crop_function(nullptr), _crop_function(nullptr)
+{
+}
+
+void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+
+ _input = input;
+ _crop_boxes = crop_boxes;
+ _box_ind = box_ind;
+ _output = output;
+ _crop_box_ind = crop_box_ind;
+ _extrapolation_value = extrapolation_value;
+
+ const static std::map<std::pair<DataType, bool>, std::pair<NECropKernel::InBoundsCropFunction *, NECropKernel::InBoundsCropFunction *>> in_map_function =
+ {
+ { { DataType::F32, false }, { &in_bounds_crop_window<float, false, false>, &in_bounds_crop_window<float, false, true> } },
+ { { DataType::F32, true }, { &in_bounds_crop_window<float, true, false>, &in_bounds_crop_window<float, true, true> } },
+ { { DataType::U16, false }, { &in_bounds_crop_window<uint16_t, false, false>, &in_bounds_crop_window<uint16_t, false, true> } },
+ { { DataType::U16, true }, { &in_bounds_crop_window<uint16_t, true, false>, &in_bounds_crop_window<uint16_t, true, true> } },
+ { { DataType::S16, false }, { &in_bounds_crop_window<int16_t, false, false>, &in_bounds_crop_window<int16_t, false, true> } },
+ { { DataType::S16, true }, { &in_bounds_crop_window<int16_t, true, false>, &in_bounds_crop_window<int16_t, true, true> } },
+ { { DataType::U32, false }, { &in_bounds_crop_window<uint32_t, false, false>, &in_bounds_crop_window<uint32_t, false, true> } },
+ { { DataType::U32, true }, { &in_bounds_crop_window<uint32_t, true, false>, &in_bounds_crop_window<uint32_t, true, true> } },
+ { { DataType::S32, false }, { &in_bounds_crop_window<int32_t, false, false>, &in_bounds_crop_window<int32_t, false, true> } },
+ { { DataType::S32, true }, { &in_bounds_crop_window<int32_t, true, false>, &in_bounds_crop_window<int32_t, true, true> } },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ { { DataType::F16, false }, { &in_bounds_crop_window<float16_t, false, false>, &in_bounds_crop_window<float16_t, false, true> } },
+ { { DataType::F16, true }, { &in_bounds_crop_window<float16_t, true, false>, &in_bounds_crop_window<float16_t, true, true> } }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ };
+
+ auto in_it = in_map_function.find({ input->info()->data_type(), input->info()->dimension(0) == 1 });
+
+ if(in_it != in_map_function.end())
+ {
+ _in_bounds_crop_functions = in_it->second;
+ }
+}
+
+Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+{
+ ARM_COMPUTE_UNUSED(extrapolation_value);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
+ ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->has_padding());
+ }
+ return Status{};
+}
+
+void NECropKernel::configure_output_shape()
+{
+ // _crop_box_ind is used to index _crop_boxes and retrieve the appropriate crop box.
+ // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+ const float x0 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(1, _crop_box_ind)));
+ const float y0 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(0, _crop_box_ind)));
+ const float x1 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(3, _crop_box_ind)));
+ const float y1 = *reinterpret_cast<const float *>(_crop_boxes->ptr_to_element(Coordinates(2, _crop_box_ind)));
+ // The normalized coordinates are scaled to retrieve the floating point image coordinates, which are rounded to integers.
+ _start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+ _output->info()->set_tensor_shape(out_shape);
+
+ _in_bounds_crop_function = _start[0] <= _end[0] ? _in_bounds_crop_functions.first : _in_bounds_crop_functions.second;
+
+ bool is_width_flipped = _end[0] < _start[0];
+ bool is_height_flipped = _end[1] < _start[1];
+ if(is_height_flipped)
+ {
+ _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2))) :
+ 0;
+ _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
+ static_cast<uint32_t>(_output->info()->dimension(2))) :
+ 0;
+ }
+ else
+ {
+ _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
+ static_cast<uint32_t>(_output->info()->dimension(2))) :
+ 0;
+ _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2))) :
+ 0;
+ }
+ if(is_width_flipped)
+ {
+ _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1))) :
+ 0;
+ _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
+ static_cast<uint32_t>(_output->info()->dimension(1))) :
+ 0;
+ }
+ else
+ {
+ _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
+ static_cast<uint32_t>(_output->info()->dimension(1))) :
+ 0;
+ _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1))) :
+ 0;
+ }
+
+ const static std::map<std::tuple<bool, bool, bool, bool>, NECropKernel::CropFunction *> map_function =
+ {
+ { std::make_tuple(false, false, false, false), &execute_window<false, false, false, false> },
+ { std::make_tuple(false, false, false, true), &execute_window<false, false, false, true> },
+ { std::make_tuple(false, false, true, false), &execute_window<false, false, true, false> },
+ { std::make_tuple(false, false, true, true), &execute_window<false, false, true, true> },
+ { std::make_tuple(false, true, false, false), &execute_window<false, true, false, false> },
+ { std::make_tuple(false, true, false, true), &execute_window<false, true, false, true> },
+ { std::make_tuple(false, true, true, false), &execute_window<false, true, true, false> },
+ { std::make_tuple(false, true, true, true), &execute_window<false, true, true, true> },
+ { std::make_tuple(true, false, false, false), &execute_window<true, false, false, false> },
+ { std::make_tuple(true, false, false, true), &execute_window<true, false, false, true> },
+ { std::make_tuple(true, false, true, false), &execute_window<true, false, true, false> },
+ { std::make_tuple(true, false, true, true), &execute_window<true, false, true, true> },
+ { std::make_tuple(true, true, false, false), &execute_window<true, true, false, false> },
+ { std::make_tuple(true, true, false, true), &execute_window<true, true, false, true> },
+ { std::make_tuple(true, true, true, false), &execute_window<true, true, true, false> },
+ { std::make_tuple(true, true, true, true), &execute_window<true, true, true, true> },
+ };
+
+ auto it = map_function.find(std::make_tuple(is_height_flipped,
+ _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+ _cols_out_of_bounds[0] > 0,
+ _cols_out_of_bounds[1] > 0));
+
+ if(it != map_function.end())
+ {
+ _crop_function = it->second;
+ }
+
+ INEKernel::configure(calculate_max_window(*_output->info()));
+}
+
+void NECropKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(window, info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
+ ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
+
+ uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
+ Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+ _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+ (*_crop_function)(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function);
+}
+} // namespace arm_compute
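configure_output_shape() above scales the normalized crop box by (dimension - 1) and rounds to the nearest integer; a worked sketch with made-up values:

    // Assumed NHWC input with W = 100, H = 80 and crop box [y0, x0, y1, x1] = [0.25, 0.10, 0.75, 0.90].
    const float x0 = 0.10f, y0 = 0.25f, x1 = 0.90f, y1 = 0.75f;
    const int   W = 100, H = 80;
    const int   start_x = static_cast<int>(std::floor(x0 * (W - 1) + 0.5f)); // 10
    const int   start_y = static_cast<int>(std::floor(y0 * (H - 1) + 0.5f)); // 20
    const int   end_x   = static_cast<int>(std::floor(x1 * (W - 1) + 0.5f)); // 89
    const int   end_y   = static_cast<int>(std::floor(y1 * (H - 1) + 0.5f)); // 59
    // Output shape keeps the channel count and spans |end - start| + 1 per axis: (C, 80, 40) here.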
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 8352c94..b360e9e 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -42,18 +42,13 @@
namespace
{
template <typename T>
-void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
+void depth_concat(const ITensor *in, ITensor *out, int depth_offset, const Window &window)
{
- const int start_x = start_xy.first;
- const int start_y = start_xy.second;
-
// Offset input
- const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
- uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes;
+ uint8_t *input_ptr = in->buffer() + in->info()->offset_first_element_in_bytes();
// Offset output
- const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
- uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes;
+ uint8_t *output_ptr = out->buffer() + out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
Iterator input(in, window);
Iterator output(out, window);
@@ -88,19 +83,13 @@
{
ARM_COMPUTE_UNUSED(depth_offset);
- // Configure kernel window
- const int left_right = (output->dimension(0) - input->dimension(0)) / 2;
- const int top_bottom = (output->dimension(1) - input->dimension(1)) / 2;
-
const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
- const unsigned int num_elems_read_per_iteration = 16 / input->element_size();
- const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1));
- AccessWindowRectangle input_access(input, -left_right, -top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -116,30 +105,20 @@
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) > output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) > output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
- // The gaps between the two lowest dimensions of input and output need to be divisible by 2
- // Otherwise it is not clear how the padding should be added onto the input tensor
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) - input->dimension(0)) % 2);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(1) - input->dimension(1)) % 2);
-
return Status{};
}
} // namespace
NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _depth_offset(0)
{
}
-BorderSize NEDepthConcatenateLayerKernel::border_size() const
-{
- return BorderSize(_top_bottom, _left_right);
-}
-
void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -149,8 +128,6 @@
_input = input;
_output = output;
_depth_offset = depth_offset;
- _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
- _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
switch(input->info()->data_type())
{
@@ -190,5 +167,5 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
+ (*_func)(_input, _output, _depth_offset, window);
}
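With the padded-window offsets gone, the concatenation only shifts the output pointer by whole depth planes; a worked sketch of that shift, assuming a hypothetical unpadded F32 output of shape (32, 16, 8):

    // strides_in_bytes() for a contiguous (W, H, D) = (32, 16, 8) F32 tensor: { 4, 128, 2048 }.
    const size_t stride_z     = 32 * 16 * sizeof(float); // 2048 bytes per depth plane
    const size_t depth_offset = 3;                        // plane where this input is written
    const size_t byte_shift   = depth_offset * stride_z;  // 6144 bytes past the first element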
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 5433755..cbc90a0 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -148,7 +148,7 @@
const float32x4_t scale = vdupq_n_f32(_input->info()->quantization_info().scale);
const int32x4_t offset = vdupq_n_s32(_input->info()->quantization_info().offset);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
const uint16x8x2_t texels_u16 =
@@ -184,7 +184,7 @@
const float16x8_t scale = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale));
const int16x8_t offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset));
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
const int16x8x2_t texels_s16 =
@@ -216,7 +216,7 @@
case DataType::S16:
{
/* Up-conversion U8 -> S16 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
@@ -237,7 +237,7 @@
case DataType::S32:
{
/* Up-conversion U8 -> S32 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
@@ -260,7 +260,7 @@
case DataType::U16:
{
/* Up-conversion U8 -> U16 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
@@ -294,7 +294,7 @@
/* Down-conversion S16 -> U8 */
if(ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t texels =
{
@@ -310,7 +310,7 @@
}
else
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t texels =
{
@@ -332,7 +332,7 @@
const int32x4_t b = vdupq_n_s32(_shift);
/* Up-conversion S16 -> S32 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t texels =
{
@@ -376,7 +376,7 @@
/* Down-conversion U16 -> U8 */
if(ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint16x8x2_t texels =
{
@@ -392,7 +392,7 @@
}
else
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint16x8x2_t texels =
{
@@ -413,7 +413,7 @@
const int32x4_t b = vdupq_n_s32(_shift);
/* Up-conversion U16 -> U32 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint16x8x2_t texels =
{
@@ -504,7 +504,7 @@
const int32x4_t zero_val_vec = vdupq_n_s32(0);
/* Down-conversion F32 -> QASYMM8 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float32x4x4_t texels =
{
@@ -535,7 +535,7 @@
const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
/* Down-conversion F32 -> F16 */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float32x4x4_t texels =
{
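The QASYMM8 up-conversions above load the input's quantization scale and offset; a minimal sketch of per-element affine dequantization of that form, with made-up parameters:

    // Assumed quantization parameters: scale = 0.05f, offset = 10.
    // Asymmetric dequantization: real = scale * (quantized - offset).
    const float   scale  = 0.05f;
    const int32_t offset = 10;
    const uint8_t q      = 138;
    const float   real   = scale * (static_cast<int32_t>(q) - offset); // 0.05f * 128 = 6.4f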
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 6071153..fdafc2d 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -40,11 +40,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-using namespace arm_compute::detail;
-using namespace arm_compute::misc::shape_calculator;
-using namespace depthwise;
-
+namespace arm_compute
+{
namespace
{
template <typename T1, typename T2, unsigned int stridex>
@@ -52,7 +49,7 @@
{
public:
static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
const int input_offset = -input->info()->quantization_info().offset;
const int weights_offset = -weights->info()->quantization_info().offset;
@@ -60,12 +57,13 @@
const int input_stride_x = input->info()->strides_in_bytes().x();
const int input_stride_y = input->info()->strides_in_bytes().y();
const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int input_stride_w = input->info()->strides_in_bytes()[3];
const int output_stride_y = output->info()->strides_in_bytes().y();
const int kernel_stride_y = weights->info()->strides_in_bytes().y();
const int kernel_stride_z = weights->info()->strides_in_bytes().z();
const int output_w = output->info()->dimension(0);
const int output_h = output->info()->dimension(1);
- const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int delta_input = detail::get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
const unsigned int conv_pad_x = conv_info.pad_left();
const unsigned int conv_pad_y = conv_info.pad_top();
@@ -77,9 +75,10 @@
// setup input window for the iterator
Window window_in = window;
- // we just want execute_window_loop to iterate over the dimensions > 2, so we set the first 2 dimensions to 0
+ // Iteration of input is taken care of in execute_window_loop
window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Window window_k = calculate_max_window(*weights->info(), Steps(1u));
@@ -94,58 +93,67 @@
int ih = 0;
int oh = 0;
- const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y - (id.z() - id.z() / depth_multiplier) * input_stride_z;
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y + (id.z() / depth_multiplier) * input_stride_z + input_stride_w * id[3];
const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
const auto ptr_weights_r1 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y);
const auto ptr_weights_r2 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y * 2);
- const auto vw_r0 = load_matrix_row(ptr_weights_r0, weights_offset);
- const auto vw_r1 = load_matrix_row(ptr_weights_r1, weights_offset);
- const auto vw_r2 = load_matrix_row(ptr_weights_r2, weights_offset);
+ const auto vw_r0 = detail::load_matrix_row(ptr_weights_r0, weights_offset);
+ const auto vw_r1 = detail::load_matrix_row(ptr_weights_r1, weights_offset);
+ const auto vw_r2 = detail::load_matrix_row(ptr_weights_r2, weights_offset);
for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
{
auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + dilation.y()) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); //uint8
+ auto p_out = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y); //int32
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_top += delta_input, in_mid += delta_input, in_low += delta_input,
p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
- store_results<stridex>(p_out, vres);
+ if(dilation == Size2D(1U, 1U))
+ {
+ auto vres = detail::convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
+ detail::store_results<stridex>(p_out, vres);
+ }
+ else
+ {
+ auto vres = detail::convolve_3x3_dilation<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), input_offset);
+ detail::store_results<stridex>(p_out, vres);
+ }
}
}
},
- in, out);
+ out);
}
};
template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
- const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+ const ITensor *input, const ITensor *weights, ITensor *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
switch(conv_stride_x)
{
case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
break;
case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
break;
case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
}
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -156,15 +164,11 @@
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
-
- if(!is_optimized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
if(output->total_size() != 0)
{
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
if(is_data_type_quantized_asymmetric(input->data_type()))
@@ -180,95 +184,63 @@
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized,
- IDepthwiseConvolution *convolver = nullptr)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
Window win;
bool window_changed = false;
- if(is_optimized)
+ // Get convolved dimensions
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+
+ // Configure kernel window (generic)
+ const unsigned int conv_stride_x = conv_info.stride().first;
+ const unsigned int conv_stride_y = conv_info.stride().second;
+ const unsigned int conv_pad_top = conv_info.pad_top();
+ const unsigned int conv_pad_left = conv_info.pad_left();
+
+ unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+ unsigned int num_elems_read_per_iteration = 0;
+
+ switch(input->data_type())
{
- if(convolver != nullptr)
- {
- auto win_last = convolver->get_window();
- win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-
- // Auto-configure output
- bool same_padding = conv_info.has_padding();
- TensorShape output_shape{ input->tensor_shape() };
-
- output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
- output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
-
- const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
- // Configure window (optimised)
- // Set padding in channels
- const int num_channels = weights->dimension(0);
- if((num_channels >= 128) && (num_channels % 16 == 0))
- {
- input->extend_padding(PaddingSize(0, 4, 0, 0));
- weights->extend_padding(PaddingSize(0, 4, 0, 0));
- output->extend_padding(PaddingSize(0, 4, 0, 0));
- }
- }
- }
- else
- {
- // Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
- const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
- // Configure kernel window (generic)
- const unsigned int conv_stride_x = conv_info.stride().first;
- const unsigned int conv_stride_y = conv_info.stride().second;
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_left = conv_info.pad_left();
-
- unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
- unsigned int num_elems_read_per_iteration = 0;
-
- switch(input->data_type())
- {
- case DataType::QASYMM8:
- num_elems_read_per_iteration = 16;
- break;
+ case DataType::QASYMM8:
+ num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1);
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_elems_read_per_iteration = 24;
- break;
+ case DataType::F16:
+ num_elems_written_per_iteration = 32 >> conv_stride_x;
+ num_elems_read_per_iteration = 24 + 23 * (dilation.x() - 1);
+ break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- num_elems_read_per_iteration = 12;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- }
-
- // Configure kernel window
- win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-
- AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3, conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ case DataType::F32:
+ num_elems_read_per_iteration = 12 + 11 * (dilation.x() - 1);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
}
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+ AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3 + 2 * (dilation.y() - 1), conv_stride_x, conv_stride_y);
+ AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
+ AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+
+ window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false), _depth_multiplier(1)
+ : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1), _dilation()
{
}
@@ -278,33 +250,40 @@
}
void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- DataLayout data_layout)
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, dilation));
_input = input;
_output = output;
_weights = weights;
_conv_info = conv_info;
_depth_multiplier = depth_multiplier;
- _convolver = nullptr;
-
- _run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
- conv_info,
- input->info()->data_type(), depth_multiplier,
- data_layout);
-
- (_run_optimized) ? configure_optimized() : configure_generic();
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::F32:
+ _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
+ break;
+ case DataType::F16:
+ _num_elems_written_per_iteration = 32 >> _conv_info.stride().first;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ }
+ _border_size = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
+ _dilation = dilation;
+ auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, dilation);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
}
-Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-
- bool is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->tensor_shape(), conv_info, input->data_type(), depth_multiplier, input->data_layout());
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, is_optimized));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, is_optimized).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, dilation).first);
return Status{};
}
@@ -313,213 +292,23 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_UNUSED(info);
- (_run_optimized) ? run_optimized(window, info) : run_generic(window, info);
-}
-
-bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier, DataLayout data_layout)
-{
- // Reshape input shape if in NHWC format
- TensorShape in_shape{ input_shape };
- if(data_layout == DataLayout::NHWC)
- {
- in_shape.set(Window::DimX, input_shape.y());
- in_shape.set(Window::DimY, input_shape.z());
- in_shape.set(Window::DimZ, input_shape.x());
- }
-
- // Check supported data type
- bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
-
- // Check for supported strides
- const auto &strides = conv_info.stride();
- bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
-
- // Check for supported padding
- const auto pad_top = conv_info.pad_top();
- const auto pad_right = conv_info.pad_right();
- const auto pad_bottom = conv_info.pad_bottom();
- const auto pad_left = conv_info.pad_left();
- PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
- bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
- bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
- bool supported_padding = is_same_padding || is_valid_padding;
-
- return supported_datatype && supported_strides && supported_padding && (depth_multiplier == 1);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
-{
- ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(_input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
- ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
-
- _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
- if(_convolver)
- {
- _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
-
- _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
- _border_size = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
-
- auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, false);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized()
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
-
- _border_size = BorderSize(0, 0);
- _convolver = create_convolver_object(_conv_info, _weights, _input, _output);
-
- auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, true, _convolver.get());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info)
-{
ARM_COMPUTE_UNUSED(info);
switch(_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+ convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
break;
case DataType::QASYMM8:
- convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+ convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
}
}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::run_optimized(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON(!_convolver);
-
- const size_t start = window.x().start();
- const size_t end = window.x().end();
- _convolver->run(start, end);
-}
-
-std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(PadStrideInfo conv_info,
- const ITensor *w,
- const ITensor *in,
- ITensor *out,
- bool setup_strides)
-{
- const DataType dt = in->info()->data_type();
- const TensorShape shape = in->info()->tensor_shape();
- const int in_rows = shape.z();
- const int in_cols = shape.y();
- const int n_batches = shape[3];
- const int n_channels = shape.x();
- const bool padding_same = conv_info.has_padding();
- const int weight_col_stride = (setup_strides) ? w->info()->strides_in_bytes().y() / w->info()->element_size() : 0;
- const int weight_row_stride = (setup_strides) ? w->info()->strides_in_bytes().z() / w->info()->element_size() : 0;
- const int input_col_stride = (setup_strides) ? in->info()->strides_in_bytes().y() / in->info()->element_size() : 0;
- const int input_row_stride = (setup_strides) ? in->info()->strides_in_bytes().z() / in->info()->element_size() : 0;
- const int input_batch_stride = (setup_strides) ? in->info()->strides_in_bytes()[3] / in->info()->element_size() : 0;
- const int output_col_stride = (setup_strides) ? out->info()->strides_in_bytes().y() / out->info()->element_size() : 0;
- const int output_row_stride = (setup_strides) ? out->info()->strides_in_bytes().z() / out->info()->element_size() : 0;
- const int output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
-
- const auto stride_x = conv_info.stride().first;
- switch(dt)
- {
- case DataType::QASYMM8:
- {
- switch(stride_x)
- {
- case 1:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
- in->ptr_to_element(Coordinates()),
- reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- case 2:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
- in->ptr_to_element(Coordinates()),
- reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- default:
- return nullptr;
- }
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- switch(stride_x)
- {
- case 1:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- case 2:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- default:
- return nullptr;
- }
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- {
- switch(stride_x)
- {
- case 1:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- case 2:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
- n_batches, in_rows, in_cols, n_channels, padding_same,
- reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
- weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- default:
- return nullptr;
- }
- break;
- }
- default:
- return nullptr;
- }
-}
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 62373e3..88f8b31 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -38,7 +38,8 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier, const Size2D &dilation)
{
ARM_COMPUTE_UNUSED(conv_info);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -48,6 +49,7 @@
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || dilation.y() < 1);
return Status{};
}
@@ -84,7 +86,7 @@
Iterator out(_output, window_out);
const int full_length = input_w + pad_left + pad_right;
- const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
+ const int max_initial_x = stride_x * (((full_length - (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1))) / stride_x) + 1);
// Define pad value
auto zero = static_cast<T>(0);
@@ -103,12 +105,12 @@
// Get pointers
const uint8_t *const input_ptr = in.ptr() + id.z() / _depth_multiplier * input_stride_z;
auto output_ptr = reinterpret_cast<T *>(out.ptr());
- const int height = src_y + _kernel_dims.height;
- const int width = src_x + _kernel_dims.width;
+ const int height = src_y + (_kernel_dims.height + (_kernel_dims.height - 1) * (_dilation.y() - 1));
+ const int width = src_x + (_kernel_dims.width + (_kernel_dims.width - 1) * (_dilation.x() - 1));
- for(int y = src_y; y < height; ++y)
+ for(int y = src_y; y < height; y += _dilation.y())
{
- for(int x = src_x; x < width; ++x, ++output_ptr)
+ for(int x = src_x; x < width; x += _dilation.x(), ++output_ptr)
{
if(x < 0 || x >= input_w || y < 0 || y >= input_h)
{
@@ -130,15 +132,16 @@
}
NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1), _dilation()
{
}
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
_input = input;
_output = output;
@@ -146,6 +149,7 @@
_conv_info = conv_info;
_has_bias = has_bias;
_depth_multiplier = depth_multiplier;
+ _dilation = dilation;
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -172,10 +176,11 @@
INEKernel::configure(win);
}
-Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier, dilation));
return Status{};
}
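For reference, the dilation support added above replaces the raw kernel extent with the effective extent kernel_size + (kernel_size - 1) * (dilation - 1) when computing loop bounds. A minimal sketch of that bound (the helper name is illustrative, not part of the library):

// Effective extent of a dilated kernel along one dimension (illustrative helper).
inline int dilated_extent(int kernel_size, int dilation)
{
    return kernel_size + (kernel_size - 1) * (dilation - 1);
}
// The im2col hunk above then bounds the x loop with:
//   max_initial_x = stride_x * (((input_w + pad_left + pad_right - dilated_extent(kernel_w, dilation_x)) / stride_x) + 1)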
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 47c895c..1520225 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,83 +24,143 @@
#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include <arm_neon.h>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
if(output->tensor_shape().total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
}
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
- constexpr unsigned int num_elems_processed_per_iteration = 8;
+ // NEDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+ return std::make_tuple(Status{}, win);
+}
- // Update window and padding
- bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+template <typename T>
+inline void store_result(T *ptr, const float32x4x4_t &v)
+{
+ ARM_COMPUTE_UNUSED(ptr, v);
+}
- output_access.set_valid_region(win, input->valid_region());
+template <>
+inline void store_result<float>(float *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, v.val[0]);
+ wrapper::vstore(ptr + 4, v.val[1]);
+ wrapper::vstore(ptr + 8, v.val[2]);
+ wrapper::vstore(ptr + 12, v.val[3]);
+}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
+{
+ wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
+ wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T>
+void run_dequantization(const ITensor *input, ITensor *output, const Window &window)
+{
+ const QuantizationInfo &qinfo = input->info()->quantization_info();
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(input, win_collapsed);
+ Iterator out(output, win_collapsed);
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, qinfo);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ uint8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(qinfo.dequantize(val));
+ }
+ },
+ in, out);
}
} // namespace
NEDequantizationLayerKernel::NEDequantizationLayerKernel()
- : _input(nullptr), _output(nullptr), _min_max(nullptr)
+ : _input(nullptr), _output(nullptr)
{
}
-void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
- _input = input;
- _output = output;
- _min_max = min_max;
+ _input = input;
+ _output = output;
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
+ auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
INEKernel::configure(std::get<1>(win_config));
}
-Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
-
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -110,53 +170,18 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window window_input_output(window);
- window_input_output.set(3, Window::Dimension(0, 1, 1));
-
- Window window_min_max;
- window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
- window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, window_input_output);
- Iterator output(_output, window_input_output);
- Iterator min_max(_min_max, window_min_max);
-
- execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ switch(_output->info()->data_type())
{
- // Get the min and max
- const float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
- const float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
-
- const float32x4_t vmin = vdupq_n_f32(min);
- const float range = max - min;
- const float32x4_t scaling = vdupq_n_f32(range / 255.0f);
-
- // Uniformly map values to range 8bit integers, i.e. [min, max] -> [0, 255]
- execute_window_loop(window_input_output, [&](const Coordinates & id)
- {
- // Get the input values
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
-
- const uint8x8_t val_u8 = vld1_u8(input_ptr);
- const uint16x8_t val_u16 = vmovl_u8(val_u8);
- const uint32x4_t val_u32_low = vmovl_u16(vget_low_u16(val_u16));
- const uint32x4_t val_u32_high = vmovl_u16(vget_high_u16(val_u16));
- float32x4_t val_low = vcvtq_f32_u32(val_u32_low);
- float32x4_t val_high = vcvtq_f32_u32(val_u32_high);
-
- // Dequantize -> (q / 255.0 * range) + min
- val_low = vmulq_f32(val_low, scaling);
- val_high = vmulq_f32(val_high, scaling);
- val_low = vaddq_f32(val_low, vmin);
- val_high = vaddq_f32(val_high, vmin);
-
- const float32x4x2_t dequantized = vuzpq_f32(val_low, val_high);
-
- // Store the dequantized values
- auto output_ptr = reinterpret_cast<float *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
- vst2q_f32(output_ptr, dequantized);
- },
- input, output);
- },
- min_max);
-}
\ No newline at end of file
+ case DataType::F32:
+ run_dequantization<float>(_input, _output, window);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ run_dequantization<float16_t>(_input, _output, window);
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
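The reworked kernel above maps each QASYMM8 value through the tensor's uniform quantization parameters instead of a per-batch min/max pair. A minimal scalar sketch of the affine rule applied by the vectorised path, assuming the usual scale/zero-point parameters (the helper name is illustrative):

// Illustrative scalar equivalent of the vdequantize()/qinfo.dequantize() calls above:
// value = scale * (q - offset), under the assumed uniform quantization scheme.
inline float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}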
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index cfed324..1d7237a 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -124,7 +124,7 @@
Iterator out_x(_output_x, window);
/* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
/* Load left and right data */
const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
@@ -153,7 +153,7 @@
const size_t stride = _input->info()->strides_in_bytes()[1];
/* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
/* Load top and bottom data */
const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
@@ -183,7 +183,7 @@
const size_t stride = _input->info()->strides_in_bytes()[1];
/* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
/* Load top, bottom, left and right data */
const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index 3ee00a4..e761815 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,7 +94,8 @@
uint8x8_t bot_high_data = vget_high_u8(bot_data);
uint8x8_t bot_low_data = vget_low_u8(bot_data);
- uint8x8_t p0, p1;
+ uint8x8_t p0;
+ uint8x8_t p1;
p0 = top_low_data;
p1 = vext_u8(top_low_data, top_high_data, 1);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 162c4b1..d557cfa 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -192,12 +192,12 @@
execute_window_loop(window_out, [&](const Coordinates & id)
{
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- float32x4_t accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- float32x4_t accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
for(int oz = 0; oz < range_z; ++oz)
{
accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 09836f1..7e11393 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -266,7 +266,7 @@
if(in_place) // In place accumulate
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Get bias and pointer to input
const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
@@ -287,7 +287,7 @@
else // Out of place accumulate
{
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Get bias and pointer to input
const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
@@ -363,7 +363,7 @@
Iterator in(input, window);
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Get bias and pointer to input
const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
@@ -399,7 +399,7 @@
Iterator bi(bias, window_bias);
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Get bias and pointer to input
const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
@@ -433,7 +433,7 @@
Iterator in(input, window);
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Get pointer to input
const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index aa458c2..6b87ea0 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -506,7 +506,7 @@
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
@@ -531,7 +531,7 @@
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
@@ -599,7 +599,7 @@
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
@@ -640,7 +640,7 @@
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 7ecc4d1..34696d8 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -87,7 +87,7 @@
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 88c20f8..2a538ec 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,7 +94,8 @@
uint8x8_t bot_high_data = vget_high_u8(bot_data);
uint8x8_t bot_low_data = vget_low_u8(bot_data);
- uint8x8_t p0, p1;
+ uint8x8_t p0;
+ uint8x8_t p1;
p0 = top_low_data;
p1 = vext_u8(top_low_data, top_high_data, 1);
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000..cf77345
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_UNUSED(idx, config);
+
+ auto_init_if_empty(*output, input->clone()->set_num_channels(2));
+
+ Window win = calculate_max_window(*input, Steps());
+ input->set_valid_region(ValidRegion(Coordinates(), input->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+{
+}
+
+void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
+
+ _input = input;
+ _output = output;
+ _idx = idx;
+
+ const size_t axis = config.axis;
+ const bool is_conj = config.conjugate;
+ const bool is_input_complex = (input->info()->num_channels() == 2);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), config);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+
+ if(axis == 0)
+ {
+ if(is_input_complex)
+ {
+ if(is_conj)
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
+ }
+ else
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, false>;
+ }
+ }
+ else
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
+ }
+ }
+ else if(axis == 1)
+ {
+ if(is_input_complex)
+ {
+ if(is_conj)
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
+ }
+ else
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, false>;
+ }
+ }
+ else
+ {
+ _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<false, false>;
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ return Status{};
+}
+
+template <bool is_input_complex, bool is_conj>
+void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
+{
+ const size_t N = _input->info()->dimension(0);
+
+ // Copy the look-up buffer to a local array
+ std::vector<unsigned int> buffer_idx(N);
+ std::copy_n(reinterpret_cast<unsigned int *>(_idx->buffer()), N, buffer_idx.data());
+
+ // Input/output iterators
+ Window slice = window;
+ slice.set(0, Window::DimX);
+ Iterator in(_input, slice);
+ Iterator out(_output, slice);
+
+ // Row buffers
+ std::vector<float> buffer_row_out(2 * N);
+ std::vector<float> buffer_row_in(2 * N);
+
+ execute_window_loop(slice, [&](const Coordinates &)
+ {
+ if(is_input_complex)
+ {
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
+
+ // Shuffle
+ for(size_t x = 0; x < 2 * N; x += 2)
+ {
+ size_t idx = buffer_idx[x / 2];
+ buffer_row_out[x] = buffer_row_in[2 * idx];
+ buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+ }
+ }
+ else
+ {
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+ // Shuffle
+ for(size_t x = 0; x < N; ++x)
+ {
+ size_t idx = buffer_idx[x];
+ buffer_row_out[2 * x] = buffer_row_in[idx];
+ }
+ }
+
+ // Copy back
+ memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+ },
+ in, out);
+}
+
+template <bool is_input_complex, bool is_conj>
+void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
+{
+ const size_t Nx = _input->info()->dimension(0);
+ const size_t Ny = _input->info()->dimension(1);
+
+ // Copy the look-up buffer to a local array
+ std::vector<unsigned int> buffer_idx(Ny);
+ std::copy_n(reinterpret_cast<unsigned int *>(_idx->buffer()), Ny, buffer_idx.data());
+
+ // Output iterator
+ Window slice = window;
+ slice.set(0, Window::DimX);
+ Iterator out(_output, slice);
+
+ // Row buffer
+ std::vector<float> buffer_row(Nx);
+
+ // Strides
+ const size_t stride_z = _input->info()->strides_in_bytes()[2];
+ const size_t stride_w = _input->info()->strides_in_bytes()[3];
+
+ execute_window_loop(slice, [&](const Coordinates & id)
+ {
+ auto *out_ptr = reinterpret_cast<float *>(out.ptr());
+ auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+ const size_t y_shuffled = buffer_idx[id.y()];
+
+ if(is_input_complex)
+ {
+ // Shuffle the entire row into the output
+ memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+ // Conjugate if necessary
+ if(is_conj)
+ {
+ for(size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x + 1] = -out_ptr[x + 1];
+ }
+ }
+ }
+ else
+ {
+ // Shuffle the entire row into the buffer
+ memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+ // Copy the buffer to the output, with a zero imaginary part
+ for(size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x] = buffer_row[x / 2];
+ }
+ }
+ },
+ out);
+}
+
+void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_UNUSED(info);
+ (this->*_func)(window);
+}
+
+} // namespace arm_compute
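The axis-0 path of the new kernel is a gather through the precomputed digit-reversal index buffer. A minimal scalar sketch of the complex-input case handled above (variable names are illustrative):

// out and in are interleaved (real, imag) rows of length N; idx holds the digit-reversed order.
for(size_t x = 0; x < N; ++x)
{
    const size_t src = idx[x];
    out[2 * x]     = in[2 * src];
    out[2 * x + 1] = conjugate ? -in[2 * src + 1] : in[2 * src + 1];
}

The real-only branch follows the same pattern but writes only the real lanes, leaving the imaginary lanes at their zero-initialised value.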
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
new file mode 100644
index 0000000..148bbe9
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <complex>
+#include <map>
+
+#include "arm_compute/core/NEON/wrapper/traits.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
+namespace arm_compute
+{
+namespace
+{
+// PI constant (from cmath)
+constexpr float kPi = float(M_PI);
+
+// Constant used in the fft_3 kernel
+constexpr float kSqrt3Div2 = 0.866025403784438;
+
+// Constants used in the fft_5 kernel
+constexpr float kW5_0 = 0.30901699437494f;
+constexpr float kW5_1 = 0.95105651629515f;
+constexpr float kW5_2 = 0.80901699437494f;
+constexpr float kW5_3 = 0.58778525229247f;
+
+// Constants used in the fft_7 kernel
+constexpr float kW7_0 = 0.62348980185873f;
+constexpr float kW7_1 = 0.78183148246802f;
+constexpr float kW7_2 = 0.22252093395631f;
+constexpr float kW7_3 = 0.97492791218182f;
+constexpr float kW7_4 = 0.90096886790241f;
+constexpr float kW7_5 = 0.43388373911755f;
+
+// Constant used in the fft_8 kernel
+constexpr float kSqrt2Div2 = 0.707106781186548;
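// Illustrative note: the kW5_*/kW7_* values above are the magnitudes of cos(2*pi*k/5) and
// sin(2*pi*k/5) (respectively cos(2*pi*k/7) and sin(2*pi*k/7)) for the radix-5 and radix-7
// twiddle factors; the required signs are applied explicitly where the constants are used in
// fft_5() and fft_7() below.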
+
+float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
+{
+ using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
+
+ const float32x2_t mask = { -1.0, 1.0 };
+ const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+
+ float32x2_t res = wrapper::vmul(tmp0, b);
+
+ b = wrapper::vrev64(b);
+ b = wrapper::vmul(b, mask);
+ res = wrapper::vmla(res, tmp1, b);
+
+ return res;
+}
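// Illustrative note: c_mul_neon() is a standard complex product on (real, imag) pairs.
// Scalar reference, with a = {a_r, a_i} and b = {b_r, b_i}:
//   res_r = a_r * b_r - a_i * b_i
//   res_i = a_r * b_i + a_i * b_r
// The vrev64/mask sequence above supplies the sign flip on the imaginary cross term.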
+
+float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
+{
+ const float a_r = wrapper::vgetlane(a, 0);
+ const float a_i = wrapper::vgetlane(a, 1);
+
+ const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ return out;
+}
+
+float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
+{
+ const auto t0 = wrapper::vadd(a, b);
+ const auto t1 = wrapper::vadd(c, d);
+ const auto t2 = wrapper::vadd(t0, t1);
+ return wrapper::vadd(t2, e);
+}
+
+float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+{
+ const auto t0 = wrapper::vadd(x1, x2);
+ const auto t1 = wrapper::vadd(x3, x4);
+ const auto t2 = wrapper::vadd(x5, x6);
+ const auto t00 = wrapper::vadd(t0, t1);
+ const auto t01 = wrapper::vadd(t2, x7);
+
+ return wrapper::vadd(t00, t01);
+}
+
+float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+{
+ const auto t0 = wrapper::vadd(x1, x2);
+ const auto t1 = wrapper::vadd(x3, x4);
+ const auto t2 = wrapper::vadd(x5, x6);
+ const auto t3 = wrapper::vadd(x7, x8);
+ const auto t00 = wrapper::vadd(t0, t1);
+ const auto t01 = wrapper::vadd(t2, t3);
+
+ return wrapper::vadd(t00, t01);
+}
+
+void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
+{
+ float32x2_t a = x;
+ float32x2_t b = c_mul_neon(w, y);
+
+ x = wrapper::vadd(a, b);
+ y = wrapper::vsub(a, b);
+}
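// Illustrative note: fft_2() is the radix-2 butterfly. With b = w * y (complex product),
// the outputs are x' = x + b and y' = x - b.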
+
+void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
+{
+ float32x2_t a = x;
+ float32x2_t b = c_mul_neon(w, y);
+ float32x2_t c = c_mul_neon(w2, z);
+
+ x = wrapper::vadd(a, b);
+ x = wrapper::vadd(x, c);
+
+ const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+
+ y = z = wrapper::vsub(a, v1);
+ y = wrapper::vadd(y, v2);
+ z = wrapper::vsub(z, v2);
+}
+
+void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+{
+ float32x2_t a = x1;
+ float32x2_t b = c_mul_neon(w, x2);
+ float32x2_t c = c_mul_neon(w2, x3);
+ float32x2_t d = c_mul_neon(w3, x4);
+
+ const auto x11 = wrapper::vadd(a, b);
+ const auto x12 = wrapper::vadd(c, d);
+ x1 = wrapper::vadd(x11, x12);
+
+ const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
+ const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
+ x2 = wrapper::vadd(x21, x22);
+
+ const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
+ const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
+ x3 = wrapper::vadd(x31, x32);
+
+ const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
+ const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
+ x4 = wrapper::vadd(x41, x42);
+}
+
+void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+
+ const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+
+ x1 = reduce_sum_5(a, b, c, d, e);
+ x2 = reduce_sum_5(a, b0, c0, d0, e0);
+ x3 = reduce_sum_5(a, b1, c1, d1, e1);
+ x4 = reduce_sum_5(a, b2, c2, d2, e2);
+ x5 = reduce_sum_5(a, b3, c3, d3, e3);
+}
+
+void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+ const float32x2_t &w4,
+ const float32x2_t &w5, const float32x2_t &w6)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+ const auto f = c_mul_neon(w5, x6);
+ const auto g = c_mul_neon(w6, x7);
+
+ const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
+ const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
+ const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
+ const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
+ const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
+ const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
+ const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
+ const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
+ const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
+ const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
+ const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
+ const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
+ const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
+ const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
+ const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
+ const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
+ const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
+ const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
+ const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+
+ x1 = reduce_sum_7(a, b, c, d, e, f, g);
+ x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
+ x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
+ x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
+ x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
+ x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
+ x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
+}
+
+void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w7)
+{
+ const auto a = x1;
+ const auto b = c_mul_neon(w, x2);
+ const auto c = c_mul_neon(w2, x3);
+ const auto d = c_mul_neon(w3, x4);
+ const auto e = c_mul_neon(w4, x5);
+ const auto f = c_mul_neon(w5, x6);
+ const auto g = c_mul_neon(w6, x7);
+ const auto h = c_mul_neon(w7, x8);
+
+ const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
+ const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
+ const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
+ const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
+ const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
+ const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
+ const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+ const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+ const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+ const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
+ const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
+ const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
+ const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
+ const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
+ const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
+ const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
+ const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
+ const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
+ const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+ const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
+ const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
+ const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
+ const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
+ const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
+ const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
+ const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
+ const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+ const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+ const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+ const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
+ const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
+ const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
+ const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
+ const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
+ const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
+ const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
+ const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
+ const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
+ const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+
+ x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
+ x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
+ x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
+ x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
+ x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
+ x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
+ x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
+ x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
+}
+
+template <bool first_stage>
+void fft_radix_2_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ auto a = float32x2_t{ 0, 0 };
+ auto b = float32x2_t{ 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ }
+
+ // Base-case prime transform
+ fft_2(a, b, w);
+
+ // Write outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_2_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+
+ // Base-case prime transform
+ fft_2(a, b, w);
+
+ // Write outputs
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_3_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ }
+ c = wrapper::vload(x + k + 4 * Nx);
+
+ // Base-case prime transform
+ fft_3(a, b, c, w, w2);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ }
+ wrapper::vstore(X + k + 4 * Nx, c);
+ }
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_3_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+ float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+
+ // Base-case prime transform
+ fft_3(a, b, c, w, w2);
+
+ // Store the output
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ wrapper::vstore(X + M * (k + 4 * Nx), c);
+ }
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_4_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+ const auto w3 = c_mul_neon(w2, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ }
+ else
+ {
+ // Load inputs
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ }
+
+ // Base-case prime transform
+ fft_4(a, b, c, d, w, w2, w3);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_4_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const auto w2 = c_mul_neon(w, w);
+ const auto w3 = c_mul_neon(w2, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+ float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+ float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+
+ // Base-case prime transform
+ fft_4(a, b, c, d, w, w2, w3);
+
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ wrapper::vstore(X + M * (k + 4 * Nx), c);
+ wrapper::vstore(X + M * (k + 6 * Nx), d);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_5_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ float32x2_t e = { 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ }
+ e = wrapper::vload(x + k + 8 * Nx);
+
+ // Base-case prime transform
+ fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+ // Store outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ }
+ wrapper::vstore(X + k + 8 * Nx, e);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_5_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+ float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+ float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+ float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+
+ // Base-case prime transform
+ fft_5(a, b, c, d, e, w, w2, w3, w4);
+
+ // Store outputs
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ wrapper::vstore(X + M * (k + 4 * Nx), c);
+ wrapper::vstore(X + M * (k + 6 * Nx), d);
+ wrapper::vstore(X + M * (k + 8 * Nx), e);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_7_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ float32x2_t a = { 0, 0 };
+ float32x2_t b = { 0, 0 };
+ float32x2_t c = { 0, 0 };
+ float32x2_t d = { 0, 0 };
+ float32x2_t e = { 0, 0 };
+ float32x2_t f = { 0, 0 };
+ float32x2_t g = { 0, 0 };
+
+ // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ e = wrapper::vgetlow(ef);
+ f = wrapper::vgethigh(ef);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ e = wrapper::vload(x + k + 8 * Nx);
+ f = wrapper::vload(x + k + 10 * Nx);
+ }
+ g = wrapper::vload(x + k + 12 * Nx);
+
+ // Base-case prime transform
+ fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ wrapper::vstore(X + k + 8 * Nx, e);
+ wrapper::vstore(X + k + 10 * Nx, f);
+ }
+ wrapper::vstore(X + k + 12 * Nx, g);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_7_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+ float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+ float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+ float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+ float32x2_t f = wrapper::vload(x + M * (k + 10 * Nx));
+ float32x2_t g = wrapper::vload(x + M * (k + 12 * Nx));
+
+ // Base-case prime transform
+ fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
+
+ // Store outputs
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ wrapper::vstore(X + M * (k + 4 * Nx), c);
+ wrapper::vstore(X + M * (k + 6 * Nx), d);
+ wrapper::vstore(X + M * (k + 8 * Nx), e);
+ wrapper::vstore(X + M * (k + 10 * Nx), f);
+ wrapper::vstore(X + M * (k + 12 * Nx), g);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+template <bool first_stage>
+void fft_radix_8_axes_0(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+ const float32x2_t w7 = c_mul_neon(w6, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+            float32x2_t a = { 0, 0 };
+            float32x2_t b = { 0, 0 };
+            float32x2_t c = { 0, 0 };
+            float32x2_t d = { 0, 0 };
+            float32x2_t e = { 0, 0 };
+            float32x2_t f = { 0, 0 };
+            float32x2_t g = { 0, 0 };
+            float32x2_t h = { 0, 0 };
+
+            // Load inputs
+ if(first_stage)
+ {
+ const auto ab = wrapper::vloadq(x + k);
+ const auto cd = wrapper::vloadq(x + k + 4 * Nx);
+ const auto ef = wrapper::vloadq(x + k + 8 * Nx);
+ const auto gh = wrapper::vloadq(x + k + 12 * Nx);
+
+ a = wrapper::vgetlow(ab);
+ b = wrapper::vgethigh(ab);
+ c = wrapper::vgetlow(cd);
+ d = wrapper::vgethigh(cd);
+ e = wrapper::vgetlow(ef);
+ f = wrapper::vgethigh(ef);
+ g = wrapper::vgetlow(gh);
+ h = wrapper::vgethigh(gh);
+ }
+ else
+ {
+ a = wrapper::vload(x + k);
+ b = wrapper::vload(x + k + 2 * Nx);
+ c = wrapper::vload(x + k + 4 * Nx);
+ d = wrapper::vload(x + k + 6 * Nx);
+ e = wrapper::vload(x + k + 8 * Nx);
+ f = wrapper::vload(x + k + 10 * Nx);
+ g = wrapper::vload(x + k + 12 * Nx);
+ h = wrapper::vload(x + k + 14 * Nx);
+ }
+
+            // Base-case prime transform
+ fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+ // Store outputs
+ if(first_stage)
+ {
+ wrapper::vstore(X + k, wrapper::vcombine(a, b));
+ wrapper::vstore(X + k + 4 * Nx, wrapper::vcombine(c, d));
+ wrapper::vstore(X + k + 8 * Nx, wrapper::vcombine(e, f));
+ wrapper::vstore(X + k + 12 * Nx, wrapper::vcombine(g, h));
+ }
+ else
+ {
+ wrapper::vstore(X + k, a);
+ wrapper::vstore(X + k + 2 * Nx, b);
+ wrapper::vstore(X + k + 4 * Nx, c);
+ wrapper::vstore(X + k + 6 * Nx, d);
+ wrapper::vstore(X + k + 8 * Nx, e);
+ wrapper::vstore(X + k + 10 * Nx, f);
+ wrapper::vstore(X + k + 12 * Nx, g);
+ wrapper::vstore(X + k + 14 * Nx, h);
+ }
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+void fft_radix_8_axes_1(float *X, float *x, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int M, unsigned int N)
+{
+ float32x2_t w{ 1.0f, 0.0f };
+ for(unsigned int j = 0; j < Nx; j++)
+ {
+ const float32x2_t w2 = c_mul_neon(w, w);
+ const float32x2_t w3 = c_mul_neon(w2, w);
+ const float32x2_t w4 = c_mul_neon(w3, w);
+ const float32x2_t w5 = c_mul_neon(w4, w);
+ const float32x2_t w6 = c_mul_neon(w5, w);
+ const float32x2_t w7 = c_mul_neon(w6, w);
+
+ for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ {
+ // Load inputs
+ float32x2_t a = wrapper::vload(x + M * k);
+ float32x2_t b = wrapper::vload(x + M * (k + 2 * Nx));
+ float32x2_t c = wrapper::vload(x + M * (k + 4 * Nx));
+ float32x2_t d = wrapper::vload(x + M * (k + 6 * Nx));
+ float32x2_t e = wrapper::vload(x + M * (k + 8 * Nx));
+ float32x2_t f = wrapper::vload(x + M * (k + 10 * Nx));
+ float32x2_t g = wrapper::vload(x + M * (k + 12 * Nx));
+ float32x2_t h = wrapper::vload(x + M * (k + 14 * Nx));
+
+ // Base-case prime transform
+ fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
+
+ // Store outputs
+ wrapper::vstore(X + M * k, a);
+ wrapper::vstore(X + M * (k + 2 * Nx), b);
+ wrapper::vstore(X + M * (k + 4 * Nx), c);
+ wrapper::vstore(X + M * (k + 6 * Nx), d);
+ wrapper::vstore(X + M * (k + 8 * Nx), e);
+ wrapper::vstore(X + M * (k + 10 * Nx), f);
+ wrapper::vstore(X + M * (k + 12 * Nx), g);
+ wrapper::vstore(X + M * (k + 14 * Nx), h);
+ }
+
+ w = c_mul_neon(w, w_m);
+ }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(config.axis > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(NEFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+ ARM_COMPUTE_UNUSED(config);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_UNUSED(config);
+
+ if(output != nullptr)
+ {
+ auto_init_if_empty(*output, *input);
+ }
+
+ Window win = calculate_max_window(*input, Steps());
+ if(output != nullptr)
+ {
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTRadixStageKernel::NEFFTRadixStageKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
+{
+}
+
+void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo &config)
+{
+ // FFT table axis 0: [radix, first_stage]
+ static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
+
+ if(fft_table_axis0.empty())
+ {
+ fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
+ fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
+ fft_table_axis0[4][false] = &fft_radix_4_axes_0<false>;
+ fft_table_axis0[5][false] = &fft_radix_5_axes_0<false>;
+ fft_table_axis0[7][false] = &fft_radix_7_axes_0<false>;
+ fft_table_axis0[8][false] = &fft_radix_8_axes_0<false>;
+
+ fft_table_axis0[2][true] = &fft_radix_2_axes_0<true>;
+ fft_table_axis0[3][true] = &fft_radix_3_axes_0<true>;
+ fft_table_axis0[4][true] = &fft_radix_4_axes_0<true>;
+ fft_table_axis0[5][true] = &fft_radix_5_axes_0<true>;
+ fft_table_axis0[7][true] = &fft_radix_7_axes_0<true>;
+ fft_table_axis0[8][true] = &fft_radix_8_axes_0<true>;
+ }
+
+ _func_0 = fft_table_axis0[config.radix][config.is_first_stage];
+}
+
+void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo &config)
+{
+    // FFT table axis 1: [radix]
+ static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
+
+ if(fft_table_axis1.empty())
+ {
+ fft_table_axis1[2] = &fft_radix_2_axes_1;
+ fft_table_axis1[3] = &fft_radix_3_axes_1;
+ fft_table_axis1[4] = &fft_radix_4_axes_1;
+ fft_table_axis1[5] = &fft_radix_5_axes_1;
+ fft_table_axis1[7] = &fft_radix_7_axes_1;
+ fft_table_axis1[8] = &fft_radix_8_axes_1;
+ }
+
+ _func_1 = fft_table_axis1[config.radix];
+}
+
+void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Output auto initialization if not yet initialized
+ if(output != nullptr)
+ {
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+ _input = input;
+ _output = output;
+ _run_in_place = (output == nullptr) || (output == input);
+ _Nx = config.Nx;
+ _axis = config.axis;
+ _radix = config.radix;
+
+ switch(config.axis)
+ {
+ case 0:
+ set_radix_stage_axis0(config);
+ break;
+ case 1:
+ set_radix_stage_axis1(config);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ break;
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+{
+ const bool run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ (run_in_place) ? nullptr : output->clone().get(),
+ config)
+ .first);
+
+ return Status{};
+}
+
+std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
+{
+ return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_UNUSED(info);
+
+ Window input_window = window;
+ input_window.set(_axis, 0);
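+    // Collapse the FFT axis in the execution window: each call of the selected radix function
+    // processes a full line along _axis itself, so the window must not also step along that axis.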
+
+ Iterator in(_input, input_window);
+ Iterator out(_run_in_place ? _input : _output, input_window);
+
+ // Precompute FFT constants
+ const unsigned int NxRadix = _radix * _Nx;
+ const float alpha = 2.0f * kPi / float(NxRadix);
+ const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
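+    // w_m = exp(-i * alpha) is the principal twiddle factor of this stage: each radix routine
+    // multiplies w by w_m at the end of its outer loop, so w = exp(-i * 2 * pi * j / (radix * Nx))
+    // at twiddle index j.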
+
+ if(_axis == 0)
+ {
+ const unsigned int N = _input->info()->dimension(0);
+ execute_window_loop(input_window, [&](const Coordinates &)
+ {
+ _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
+ },
+ in, out);
+ }
+ else
+ {
+ const unsigned int N = _input->info()->dimension(0);
+ const unsigned int M = _input->info()->dimension(1);
+ execute_window_loop(input_window, [&](const Coordinates &)
+ {
+ _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M);
+ },
+ in, out);
+ }
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
new file mode 100644
index 0000000..56703ba
--- /dev/null
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
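+// Scales a single complex value (two packed floats) by 1 / scale and optionally conjugates it.
+// Dividing by the transform length together with conjugation is the typical inverse-FFT use of
+// this kernel, although it simply applies whatever FFTScaleKernelInfo requests.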
+void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
+{
+ const auto a = wrapper::vload(c_in);
+ auto b = wrapper::vdiv(a, float32x2_t{ scale, scale });
+ if(is_conjugate)
+ {
+ const float img_part = wrapper::vgetlane(b, 1);
+ b = wrapper::vsetlane(-img_part, b, 1);
+ }
+
+ wrapper::vstore(c_out, b);
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ if(output != nullptr)
+ {
+        // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ // NEFFTScaleKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+ }
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFFTScaleKernel::NEFFTScaleKernel()
+ : _input(nullptr), _output(nullptr), _scale(), _run_in_place(false), _is_conj(false)
+{
+}
+
+void NEFFTScaleKernel::configure(ITensor *input, ITensor *output, const FFTScaleKernelInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+
+ _input = input;
+ _output = output;
+ _run_in_place = (output == nullptr) || (output == input);
+ _is_conj = config.conjugate;
+ _scale = config.scale;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), _run_in_place ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEFFTScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config)
+{
+ ARM_COMPUTE_UNUSED(config);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_UNUSED(info);
+
+ Window input_window = window;
+ input_window.set(Window::DimX, 0);
+
+ Iterator in(_input, input_window);
+ Iterator out(_run_in_place ? _input : _output, input_window);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
+ },
+ in, out);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 919efd2..81bcc8b 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,32 +49,30 @@
{
ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
- static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE]
- {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
- { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
- { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
- { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
- { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
- { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
- { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
- { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
- { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
- { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
- { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
- { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
- { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
- { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
- { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
- { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
+ static const std::array<std::array<uint8_t, PERMUTATIONS>, PERM_SIZE> permutations_table{ { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
+ { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
+ { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
+ { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
+ { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
+ { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
+ { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
+ { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
+ { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
+ { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
+ { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
+ { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
+ { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
- };
+ } };
const uint8x8x2_t index =
{
{
- vld1_u8(permutations_table[k]),
- vld1_u8(permutations_table[k] + 8)
+ vld1_u8(permutations_table[k].data()),
+ vld1_u8(permutations_table[k].data() + 8)
}
};
@@ -112,7 +110,7 @@
. . 9 8 7 . . .
*/
- static const uint8_t top_right[8] =
+ static const std::array<uint8_t, 8> top_right =
{
/* The register r.val[0] will be used to retrieve these texels:
. . . 0 1 . . .
@@ -130,7 +128,7 @@
255
};
- static const uint8_t bottom_right[8] =
+ static const std::array<uint8_t, 8> bottom_right =
{
/* The register r.val[1] will be used to retrieve these texels:
. . . . . . 5 .
@@ -147,7 +145,7 @@
20 /* low table, third row, elem 5, value 7 in the diagram above*/
};
- static const uint8_t top_left[8] =
+ static const std::array<uint8_t, 8> top_left =
{
/* The register r.val[2] will be used to retrieve these texels:
. . F . . . . .
@@ -165,7 +163,7 @@
2 /* top table, first row, elem 3, value F in the diagram above*/
};
- static const uint8_t bottom_left[8] =
+ static const std::array<uint8_t, 8> bottom_left =
{
/* The register r.val[3] will be used to retrieve these texels:
B . . . . . . .
@@ -185,10 +183,10 @@
const uint8x8x4_t reg =
{
{
- vld1_u8(top_right),
- vld1_u8(bottom_right),
- vld1_u8(top_left),
- vld1_u8(bottom_left)
+ vld1_u8(top_right.data()),
+ vld1_u8(bottom_right.data()),
+ vld1_u8(top_left.data()),
+ vld1_u8(bottom_left.data())
}
};
@@ -268,7 +266,7 @@
return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
}
-inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS])
+inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
{
/*
This function determines whether the point 'p' is a corner.
@@ -287,7 +285,7 @@
return corner_detected;
}
-inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r)
+inline uint8x8x2_t create_circle_tbl(const std::array<uint8_t *const __restrict, 7> &buffer, size_t in_offset, const uint8x8x4_t &circle_index_r)
{
/*
This function builds a LUT holding the 16 texels in the Brensenham circle radius 3.
@@ -329,7 +327,7 @@
return tbl_circle_texels;
}
-inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS])
+inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
{
uint8_t b = 255;
uint8_t a = tolerance;
@@ -411,7 +409,7 @@
Iterator in(_input, window);
Iterator out(_output, window);
- const uint8_t *const __restrict in_row[7] =
+ const std::array<uint8_t *const __restrict, 7> in_row
{
_input->ptr_to_element(Coordinates(-3, -3)),
_input->ptr_to_element(Coordinates(-3, -2)),
@@ -429,7 +427,7 @@
return p_is_in_ab && q_is_in_ab;
};
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const size_t in_offset = in.offset();
const uint8_t p0 = *in.ptr();
@@ -455,11 +453,11 @@
/* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
- if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data()))
+ if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index))
{
if(_non_max_suppression)
{
- score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data());
+ score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index);
}
else
{
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index f4046e0..4127dc8 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -168,7 +168,7 @@
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates & id)
+ execute_window_loop(vertical, [&](const Coordinates &)
{
uint8_t *base_addr = start_valid_region + vertical_it.offset();
// Fill left and right borders
@@ -188,7 +188,7 @@
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
uint8_t *base_addr = start_valid_region + plane_it.offset();
// Top border
@@ -224,7 +224,7 @@
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates & id)
+ execute_window_loop(vertical, [&](const Coordinates &)
{
uint8_t *base_addr = start_valid_region + vertical_it.offset();
// Fill left and right borders
@@ -244,7 +244,7 @@
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
uint8_t *base_addr = start_valid_region + plane_it.offset();
// Top border
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index d1cff6f..50060b2 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,7 +111,7 @@
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates & id)
+ execute_window_loop(vertical, [&](const Coordinates &)
{
std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value);
std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value);
@@ -122,7 +122,7 @@
// All values are set at once
Iterator horizontal_it(_tensor, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
for(size_t i = 0; i < _border_size.top; ++i)
{
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 6551d9e..43554a0 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,7 +111,7 @@
if(data_type == DataType::F32)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
@@ -121,7 +121,7 @@
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
else if(data_type == DataType::F16)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float16x8_t res = vfloorq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())));
vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 7769d9e..c9299831 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -126,7 +126,7 @@
win_out.set_dimension_step(Window::DimX, 16);
Iterator out(output, win_out);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint16x4x4_t data =
{
@@ -154,7 +154,7 @@
win_out.set_dimension_step(Window::DimX, 16);
Iterator out(output, win_out);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint32x4x4_t data =
{
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index a100cd2..b561d1e 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -479,7 +479,7 @@
void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8_t *mtx_a0 = ina.ptr();
const uint8_t *mtx_b0 = inb.ptr();
@@ -599,7 +599,7 @@
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 33a5b4a..2293926 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -106,20 +106,17 @@
Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win,
- mm_result_access);
+ window_changed = window_changed || update_window_and_padding(win, mm_result_access);
if(a_offset != 0)
{
AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win,
- vector_sum_col_access);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
}
if(b_offset != 0)
{
AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
- window_changed = window_changed || update_window_and_padding(win,
- vector_sum_row_access);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000..46e53ce
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
+{
+ return
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
+ }
+ };
+}
+
+inline int32x4x4_t load(const int32_t *ptr, int32_t x)
+{
+ return
+ {
+ {
+ vld1q_s32(ptr + x + 0),
+ vld1q_s32(ptr + x + 4),
+ vld1q_s32(ptr + x + 8),
+ vld1q_s32(ptr + x + 12)
+ }
+ };
+}
+
+inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
+{
+ int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+ return a_offset_term_s32;
+}
+
+inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset)
+{
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr);
+ b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+ return b_offset_term_s32;
+}
+
+inline int32x4x4_t get_k_offset(int32_t k_offset)
+{
+ return
+ {
+ {
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset)
+ }
+ };
+}
+
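+// Output stage used for GEMMLowpOutputStageType::QUANTIZE_DOWN: the accumulators arrive already
+// multiplied by gemmlowp_multiplier, so this helper only shifts right (via a negative left shift),
+// saturates to [0, 255] and optionally applies the bounded-ReLU clamp.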
+template <bool is_bounded_relu>
+inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to U8
+ uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_u8 = vmaxq_u8(out_u8, min_u8);
+ out_u8 = vminq_u8(out_u8, max_u8);
+ }
+
+ return out_u8;
+}
+
+inline Window get_win_vector_sum(const Window &window)
+{
+ Window win_vector_sum(window);
+ win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ return win_vector_sum;
+}
+
+inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col)
+{
+ Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window));
+ return vector_sum_col_it;
+}
+
+inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row)
+{
+ Window win_vector_sum_row = get_win_vector_sum(window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+ return vector_sum_row_it;
+}
+
+inline Iterator get_bias_it(const Window &window, const ITensor *bias)
+{
+ Window win_bias(window);
+ win_bias.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ Iterator bias_it(bias, win_bias);
+ return bias_it;
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b),
+ vaddq_s32(a.val[1], b),
+ vaddq_s32(a.val[2], b),
+ vaddq_s32(a.val[3], b)
+ }
+ };
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b.val[0]),
+ vaddq_s32(a.val[1], b.val[1]),
+ vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+ return
+ {
+ {
+ vmulq_n_s32(a.val[0], mul_scalar),
+ vmulq_n_s32(a.val[1], mul_scalar),
+ vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)
+ }
+ };
+}
+
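+// Offset contribution: the corrected quantized-GEMM result is the raw s32 accumulator plus
+// a_offset * vector_sum_col (one term per output column), b_offset * vector_sum_row (one term per
+// output row) and the constant k_offset = a_offset * b_offset * K, i.e. the cross terms of
+// sum_k (a_k + a_offset) * (b_k + b_offset). The helper below adds whichever of those terms the
+// template flags enable and then applies the requested output stage.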
+template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
+ const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset,
+ GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x)
+{
+ int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
+ if(!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
+ }
+ if(has_a_offset && has_b_offset)
+ {
+ offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
+ }
+ if(has_b_offset)
+ {
+ offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
+ }
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if(has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if(has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if(!is_fixed_point || has_b_offset)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ }
+ if(!is_fixed_point)
+ {
+ in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier);
+ }
+
+ if(is_fixed_point)
+ {
+ vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8));
+ }
+ else
+ {
+ vst1q_u8(out_it.ptr() + x, finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+ }
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if(has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if(has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if(is_fixed_point)
+ {
+ // Finalize and store the result
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift,
+ output_stage.gemmlowp_offset, static_cast<uint8_t>(output_stage.gemmlowp_min_bound), static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift;
+
+ // Bound and store the result
+ if(is_bounded_relu)
+ {
+ in_value = static_cast<uint8_t>(std::max<int32_t>(output_stage.gemmlowp_min_bound, std::min<int32_t>(output_stage.gemmlowp_max_bound, in_value)));
+ }
+ *(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
+ }
+ }
+}
+
+template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+void run_offset_contribution_output_stage(const Window &window,
+ const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_min_bound));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if((a_offset != 0) && (b_offset != 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+ Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ + id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ + id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+ }
+ }
+ else if((a_offset == 0) && (b_offset != 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ + id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_row_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
+ + id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_row_it, mm_result_it, out_it);
+ }
+ }
+ else if((a_offset != 0) && (b_offset == 0))
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
+ output_stage, window_step_x, window_start_x, window_end_x);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if(b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = output->tensor_shape();
+ if(output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches as the output tensor");
+
+ if(a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1");
+ }
+ }
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *output)
+{
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps());
+
+    // Note: This kernel performs 16 elements per iteration.
+    // However, since we use a left-over for loop, out-of-bounds reads or writes cannot occur.
+    // For this reason num_elems_processed_per_iteration is 1 and update_window_and_padding() can be skipped.
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
+NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
+get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage)
+{
+ static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
+ {
+ { 0, &run_offset_contribution_output_stage<false, false, false> },
+ { 1, &run_offset_contribution_output_stage<true, false, false> },
+ { 2, &run_offset_contribution_output_stage<false, true, false> },
+ { 3, &run_offset_contribution_output_stage<true, true, false> },
+ { 4, &run_offset_contribution_output_stage<false, false, true> },
+ { 5, &run_offset_contribution_output_stage<true, false, true> },
+ { 6, &run_offset_contribution_output_stage<false, true, true> },
+ { 7, &run_offset_contribution_output_stage<true, true, true> }
+ };
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr
+ && mm_result->info()->num_dimensions() > 1
+ && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+ // Check if we need to clamp the result using min and max
+ const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
+ && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+
+ const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+
+ // key acts as a bitset, setting the first bit on reinterpret_as_3d,
+ // the second on is_bounded_relu, and the third on is_fixed_point.
+ uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+ return map_function.find(key)->second;
+}
+} // namespace
+
+NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
+ : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
+ _output_stage(GEMMLowpOutputStageInfo())
+
+{
+}
+
+void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k,
+ int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+ bias != nullptr ? bias->info() : nullptr, // NOLINT
+ output->info(), a_offset, b_offset, output_stage)); // NOLINT
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _bias = bias;
+ _mm_result = mm_result;
+ _output = output;
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _k_offset = a_offset * b_offset * k;
+ _output_stage = output_stage;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+        // Check if vector_sum_col_shape should be slid or not
+        // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(mm_result->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+
+ _function = get_configured_function(mm_result, vector_sum_row, output_stage);
+}
+
+Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage);
+}
+
+} // namespace arm_compute
\ No newline at end of file
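
For reference, get_configured_function() above selects one of eight template instantiations through a small bitset key built from three booleans. A minimal, self-contained sketch of the same dispatch pattern, using stand-in names rather than the library's kernel types:

    #include <cstdint>
    #include <iostream>
    #include <map>

    // Stand-in for run_offset_contribution_output_stage<reinterpret_as_3d, is_bounded_relu, is_fixed_point>
    template <bool ReinterpretAs3D, bool IsBoundedRelu, bool IsFixedPoint>
    void run_stage()
    {
        std::cout << ReinterpretAs3D << IsBoundedRelu << IsFixedPoint << '\n';
    }

    using StageFn = void (*)();

    int main()
    {
        // Bit 0: reinterpret_as_3d, bit 1: is_bounded_relu, bit 2: is_fixed_point
        static const std::map<uint8_t, StageFn> table =
        {
            { 0, &run_stage<false, false, false> }, { 1, &run_stage<true, false, false> },
            { 2, &run_stage<false, true, false> },  { 3, &run_stage<true, true, false> },
            { 4, &run_stage<false, false, true> },  { 5, &run_stage<true, false, true> },
            { 6, &run_stage<false, true, true> },   { 7, &run_stage<true, true, true> }
        };

        const bool reinterpret_as_3d = false;
        const bool is_bounded_relu   = true;
        const bool is_fixed_point    = true;
        const uint8_t key = (reinterpret_as_3d ? 1U : 0U) | ((is_bounded_relu ? 1U : 0U) << 1) | ((is_fixed_point ? 1U : 0U) << 2);
        table.at(key)(); // key == 6, dispatches to run_stage<false, true, true>
        return 0;
    }

With three independent booleans the key enumerates all 2^3 = 8 combinations, so the lookup can never miss.
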
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index f0ac695..4906e6a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -86,37 +86,6 @@
namespace arm_compute
{
class Coordinates;
-
-/* Function used by the left-over for loop to perform the quantization */
-template <bool is_bounded_relu>
-inline uint8_t finalize_quantization(int32x4_t in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8_t min_u8, uint8_t max_u8)
-{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
- const static int32x4_t sat_value_s32 = vdupq_n_s32(255);
-
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_s32 = vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier);
-
- // Round to the nearest division by a power-of-two using result_shift_s32
- in_s32 = rounding_divide_by_pow2(in_s32, result_shift);
-
- // Add the offset terms
- in_s32 = vaddq_s32(in_s32, result_offset_after_shift_s32);
-
- // Saturate negative values
- in_s32 = vmaxq_s32(in_s32, zero_s32);
- in_s32 = vminq_s32(in_s32, sat_value_s32);
-
- auto out_u8 = static_cast<uint8_t>(vgetq_lane_s32(in_s32, 0));
-
- if(is_bounded_relu)
- {
- out_u8 = std::max(out_u8, min_u8);
- out_u8 = std::min(out_u8, max_u8);
- }
-
- return out_u8;
-}
} // namespace arm_compute
template <bool is_bounded_relu>
@@ -145,7 +114,7 @@
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias(_bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
{
// Compute 16 elements per iteration
int x = window_start_x;
@@ -188,17 +157,15 @@
// Add bias
in_value += bias_value;
-
// Finalize and store the result
- *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),
- static_cast<uint8_t>(_max));
+ *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
}
},
in, out, bias);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
{
// Compute 16 elements per iteration
int x = window_start_x;
@@ -220,10 +187,10 @@
// Compute left-over elements
for(; x < window_end_x; ++x)
{
- const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
// Finalize and store the result
- *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+ *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
}
},
in, out);
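
The deleted finalize_quantization() above (its scalar counterpart is now taken from the shared NEAsymm helpers) implements the usual gemmlowp-style fixed-point requantization: saturating rounding doubling multiply-high, rounding shift, offset, then clamping. A scalar sketch of that math, with illustrative names and assuming the standard gemmlowp semantics:

    #include <algorithm>
    #include <cstdint>

    // Scalar form of vqrdmulhq_n_s32: high 32 bits of the doubled product, rounded.
    // (The INT32_MIN * INT32_MIN saturation corner case is omitted for brevity.)
    inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t ab    = static_cast<int64_t>(a) * b;
        const int64_t nudge = (ab >= 0) ? (1LL << 30) : (1 - (1LL << 30));
        return static_cast<int32_t>((ab + nudge) / (1LL << 31));
    }

    // Scalar form of rounding_divide_by_pow2: divide by 2^exponent, rounding to nearest.
    inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    inline uint8_t quantize_down_scalar(int32_t acc, int32_t multiplier, int32_t shift,
                                        int32_t offset_after_shift, uint8_t min_u8, uint8_t max_u8,
                                        bool is_bounded_relu)
    {
        int32_t value = rounding_doubling_high_mul(acc, multiplier);
        value         = rounding_divide_by_pow2(value, shift);
        value += offset_after_shift;
        value = std::min(std::max(value, 0), 255); // saturate to the uint8 range
        if(is_bounded_relu)
        {
            value = std::min<int32_t>(std::max<int32_t>(value, min_u8), max_u8);
        }
        return static_cast<uint8_t>(value);
    }
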
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 573373f..a221bd7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -183,7 +183,7 @@
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias(_bias, win_biases);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
// Compute 16 elements per iteration
int x = window_start_x;
@@ -245,7 +245,7 @@
}
else
{
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
// Compute 16 elements per iteration
int x = window_start_x;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 42353ed..5ac2323 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -122,7 +122,7 @@
{
case DataType::F32:
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
@@ -144,7 +144,7 @@
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 757dbbc..86bea84 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,7 +60,7 @@
Iterator in(input, window);
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
const auto out_ptr = reinterpret_cast<float *>(out.ptr());
@@ -87,7 +87,7 @@
Iterator in(input, window);
Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index f182fb2..a82fae7 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -388,7 +388,7 @@
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
@@ -687,7 +687,7 @@
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 048c229..f412980 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,7 +80,7 @@
static const int16x8_t two = vdupq_n_s16(2);
static const int16x8_t four = vdupq_n_s16(4);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index b62e281..0e4549e 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,7 +88,7 @@
static const int16x8_t six = vdupq_n_s16(6);
static const int16x8_t four = vdupq_n_s16(4);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
uint8x16_t data = vld1q_u8(input.ptr());
@@ -112,7 +112,7 @@
BorderSize NEGaussian5x5VertKernel::border_size() const
{
- return BorderSize(2, 0);
+ return BorderSize{ 2, 0 };
}
void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
@@ -159,7 +159,7 @@
const uint16x8_t six = vdupq_n_u16(6);
const uint16x8_t four = vdupq_n_u16(4);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const size_t input_offset_high_s16 = input.offset();
const size_t input_offset_low_s16 = input.offset() + 16;
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 7a123e2..13cee19 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@
BorderSize NEGaussianPyramidHorKernel::border_size() const
{
- return BorderSize(0, 2);
+ return BorderSize{ 0, 2 };
}
void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output)
@@ -126,7 +126,7 @@
Iterator out(_output, win_out);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16x2_t data_2q = vld2q_u8(in.ptr());
const uint8x16_t &data_even = data_2q.val[0];
@@ -155,7 +155,7 @@
BorderSize NEGaussianPyramidVertKernel::border_size() const
{
- return BorderSize(2, 0);
+ return BorderSize{ 2, 0 };
}
void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output)
@@ -236,7 +236,7 @@
const uint8_t *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3));
const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4));
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Low data
const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index c204395..c58b1c0 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -695,7 +695,7 @@
Iterator phase(_input_phase, win_phase);
Iterator out(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto mag_row_ptr = reinterpret_cast<const int16_t *>(mag.ptr());
const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 61221c1..34e68e7 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -719,7 +719,7 @@
const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
(*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
},
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..b8e204c
--- /dev/null
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 16 / output->element_size();
+
+    // The window needs to be based on the input as we copy the full width of the input
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::F16,
+ DataType::U32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) + height_offset > output->dimension(Window::DimY));
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEHeightConcatenateLayerKernel::NEHeightConcatenateLayerKernel()
+ : _input(nullptr), _output(nullptr), _height_offset(0)
+{
+}
+
+void NEHeightConcatenateLayerKernel::configure(const ITensor *input, unsigned int height_offset, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+
+ _input = input;
+ _output = output;
+ _height_offset = height_offset;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEHeightConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Offset output pointer to the correct position
+ uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _height_offset * _output->info()->strides_in_bytes()[Window::DimY];
+
+ // Create iterators
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const DataType dt = _input->info()->data_type();
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+ if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
+ {
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ vst1q_u8(output_ptr + output.offset(), vquantize(vdequantize(vld1q_u8(input.ptr()), input_qinfo), output_qinfo));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto in_ptr = input.ptr();
+ const auto out_ptr = output_ptr + output.offset();
+
+ wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr));
+ },
+ input, output);
+ }
+}
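
NEHeightConcatenateLayerKernel above works by offsetting the output pointer by height_offset rows and streaming the input window across, requantizing per vector when the two QASYMM8 quantization infos differ. Leaving the requantization branch aside, the core copy reduces to the following sketch (plain C++, byte strides, illustrative names):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy a 2D plane of `rows` rows, `row_bytes` bytes each, into the output
    // starting `height_offset` rows down. Strides are expressed in bytes.
    void concatenate_along_height(const uint8_t *src, size_t src_stride_y,
                                  uint8_t *dst, size_t dst_stride_y,
                                  size_t row_bytes, size_t rows, size_t height_offset)
    {
        uint8_t *dst_base = dst + height_offset * dst_stride_y;
        for(size_t y = 0; y < rows; ++y)
        {
            std::memcpy(dst_base + y * dst_stride_y, src + y * src_stride_y, row_bytes);
        }
    }

Each input of the concatenation is handled with its own height_offset, so the copies simply stack along the Y axis of the output.
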
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 2e3d9de..34af0cf 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -208,32 +208,48 @@
const int end_x = start_x + kernel_width * dilation_x;
const int end_y = start_y + kernel_height * dilation_y;
const int pad_quant = kernel_width * input_c;
-
- for(int y = start_y; y < end_y; y += dilation_y)
+ if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1))
{
- if(y < 0 || y >= input_h)
+ for(int y = start_y; y < end_y; y += dilation_y)
{
- memset(out_ptr, pad_value, pad_quant * sizeof(T));
- out_ptr += pad_quant;
+            // Optimized for no dilation and no boundary pixels
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+ out_ptr += input_c * kernel_width;
}
- else
+ }
+ else
+ {
+ for(int y = start_y; y < end_y; y += dilation_y)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ if(y < 0 || y >= input_h)
{
- if(x < 0 || x >= input_w)
+ memset(out_ptr, pad_value, pad_quant * sizeof(T));
+ out_ptr += pad_quant;
+ }
+ else if(dilation_x > 1 || start_x < 0 || end_x >= input_w)
+ {
+ for(int x = start_x; x < end_x; x += dilation_x)
{
- memset(out_ptr, pad_value, input_c * sizeof(T));
- out_ptr += input_c;
+ if(x < 0 || x >= input_w)
+ {
+ memset(out_ptr, pad_value, input_c * sizeof(T));
+ out_ptr += input_c;
+ }
+ else
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
+ out_ptr += input_c;
+ }
}
- else
- {
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
- out_ptr += input_c;
- }
+ }
+ else
+ {
+            // Optimized for no dilation and no boundary pixels
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * sizeof(T));
+ out_ptr += input_c * kernel_width;
}
}
}
-
// Append 1 if the convolution layer has biases
if(has_bias)
{
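
The restructured im2col loop above adds a fast path: when the whole kernel footprint falls inside the input and dilation_x == 1, the channels of a full kernel row are contiguous, so each row is lowered with a single memcpy of input_c * kernel_width elements instead of per-element padding checks. A reduced sketch of that inner copy, with the padded path elided and names kept close to the diff purely for illustration:

    #include <cstdint>
    #include <cstring>

    // Fast-path im2col row copy: one contiguous block of input_c * kernel_width
    // elements per kernel row (valid only without horizontal dilation or padding).
    template <typename T>
    T *copy_kernel_row(T *out_ptr, const uint8_t *in_ptr,
                       int y, int start_x, int input_stride_y, int input_stride_z,
                       int input_c, int kernel_width)
    {
        const auto *src = reinterpret_cast<const T *>(in_ptr + y * input_stride_z + start_x * input_stride_y);
        std::memcpy(out_ptr, src, static_cast<size_t>(input_c) * kernel_width * sizeof(T));
        return out_ptr + input_c * kernel_width;
    }

For a 3x3 kernel over 16 F32 channels this moves 3 * 16 * 4 = 192 bytes per row in one call, three calls per output position, instead of nine element-wise copies with bounds checks.
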
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 16a3cf7..b6db5f0 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,7 @@
BorderSize NEIntegralImageKernel::border_size() const
{
- return BorderSize(1, 0, 0, 1);
+ return BorderSize{ 1, 0, 0, 1 };
}
bool NEIntegralImageKernel::is_parallelisable() const
@@ -83,7 +83,7 @@
const auto output_top_left = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(-1, -1)));
const auto output_top_mid = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(0, -1)));
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t input_pixels = vld1q_u8(input.ptr());
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index cda041d..efdcc44 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@
const auto sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ execute_window_loop(in_slice, [&](const Coordinates &)
{
const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
@@ -93,7 +93,7 @@
auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ execute_window_loop(in_slice, [&](const Coordinates &)
{
const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
@@ -127,7 +127,7 @@
auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ execute_window_loop(in_slice, [&](const Coordinates &)
{
const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 83593e7..ddf869e 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -405,9 +405,9 @@
init_keypoints(list_start, list_end);
- const int buffer_size = _window_dimension * _window_dimension;
- int32_t bilinear_ix[buffer_size];
- int32_t bilinear_iy[buffer_size];
+ const int buffer_size = _window_dimension * _window_dimension;
+ std::vector<int32_t> bilinear_ix(buffer_size);
+ std::vector<int32_t> bilinear_iy(buffer_size);
const int half_window = _window_dimension / 2;
@@ -444,7 +444,7 @@
int iA12 = 0;
int iA22 = 0;
- std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix, bilinear_iy);
+ std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix.data(), bilinear_iy.data());
const float A11 = iA11 * FLT_SCALE;
const float A12 = iA12 * FLT_SCALE;
@@ -490,7 +490,7 @@
int ib1 = 0;
int ib2 = 0;
- std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix, bilinear_iy);
+ std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix.data(), bilinear_iy.data());
double b1 = ib1 * FLT_SCALE;
double b2 = ib2 * FLT_SCALE;
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 4a318f0..8c09898 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -323,7 +323,7 @@
Iterator gy(_gy, window);
Iterator magnitude(_magnitude, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t input1 =
{
@@ -369,7 +369,7 @@
Iterator gy(_gy, window);
Iterator phase(_phase, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t input1 =
{
@@ -415,7 +415,7 @@
Iterator magnitude(_magnitude, window);
Iterator phase(_phase, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const int16x8x2_t input1 =
{
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index 7895b00..0af6305 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,7 @@
uint64x1_t sum_squared = vdup_n_u64(0);
// Calculate sum
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t in_data = vld1q_u8(iterator.ptr());
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 5bcdc7b..9dc1bc9 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,7 +87,7 @@
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
index 2b57b15..a0fab99 100644
--- a/src/core/NEON/kernels/NEMemsetKernel.cpp
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -67,7 +67,7 @@
collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator tensor_it(_tensor, collapsed);
- execute_window_loop(collapsed, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates &)
{
uint8_t *base_addr = start_valid_region + tensor_it.offset();
// Set memory
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 5d1b4b3..fe3af0b 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -144,7 +144,7 @@
float carry_min_scalar = std::numeric_limits<float>::max();
float carry_max_scalar = std::numeric_limits<float>::lowest();
- execute_window_loop(window_input, [&](const Coordinates & id)
+ execute_window_loop(window_input, [&](const Coordinates &)
{
int x = x_start;
const auto in_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
@@ -203,7 +203,7 @@
Iterator output(_output, window_output);
- execute_window_loop(window_output, [&](const Coordinates & id)
+ execute_window_loop(window_output, [&](const Coordinates &)
{
vst1_f32(reinterpret_cast<float *>(output.ptr()), reset_values);
},
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index befece2..08b27e3 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -152,7 +152,7 @@
Iterator input(_input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int x = x_start;
@@ -209,7 +209,7 @@
Iterator input(_input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int x = x_start;
const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
@@ -268,7 +268,7 @@
Iterator input(_input, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
int x = x_start;
const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
@@ -323,11 +323,11 @@
template <class T, std::size_t... N>
struct NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>
{
- static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
+ static const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> func_table;
};
template <class T, std::size_t... N>
-const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table[sizeof...(N)] =
+const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table
{
&NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
};
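
The func_table change above keeps the existing trick of generating every minmax_loc<T, ...> instantiation from an index sequence, just held in a std::array rather than a raw C array. A minimal standalone sketch of the pattern (std::index_sequence instead of the library's utility backport, generic names):

    #include <array>
    #include <cstddef>
    #include <iostream>
    #include <utility>

    template <bool FindMin, bool FindMax>
    void kernel()
    {
        std::cout << "min=" << FindMin << " max=" << FindMax << '\n';
    }

    using Fn = void (*)();

    // One function pointer per index; each index's bits become the booleans of kernel<>.
    template <std::size_t... N>
    constexpr std::array<Fn, sizeof...(N)> make_table(std::index_sequence<N...>)
    {
        return { { &kernel<bool(N & 2), bool(N & 1)>... } };
    }

    int main()
    {
        constexpr auto table = make_table(std::make_index_sequence<4>{});
        table[3](); // dispatches to kernel<true, true>
        return 0;
    }
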
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 52dbe26..00536f0 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -117,7 +117,7 @@
sort(p4, p2);
}
-inline void sort21(uint8x8_t p[21])
+inline void sort21(std::array<uint8x8_t, 21> &p)
{
sort(p[0], p[1]);
sort(p[2], p[3]);
@@ -222,7 +222,7 @@
sort(p[10], p[16]);
}
-inline void sort25(uint8x8_t p[25])
+inline void sort25(std::array<uint8x8_t, 25> &p)
{
sort(p[1], p[2]);
sort(p[0], p[1]);
@@ -429,7 +429,7 @@
const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -463,7 +463,7 @@
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -471,7 +471,7 @@
const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
- const uint8x8_t d[] =
+ const std::array<uint8x8_t, 10> d =
{
vget_low_u8(top2_data),
vget_high_u8(top2_data),
@@ -485,7 +485,7 @@
vget_high_u8(bot2_data)
};
- uint8x8_t p[25];
+ std::array<uint8x8_t, 25> p{ 0 };
for(unsigned int i = 0; i < 5; ++i)
{
const unsigned int idx_d = i * 2;
@@ -524,7 +524,7 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
// Get min of rows
uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
@@ -563,7 +563,7 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
@@ -593,7 +593,7 @@
const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -624,7 +624,7 @@
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
@@ -671,7 +671,7 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
@@ -717,7 +717,7 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
}
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
@@ -754,7 +754,7 @@
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -762,7 +762,7 @@
const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
- uint8x8_t d[] =
+ std::array<uint8x8_t, 10> d =
{
vget_low_u8(top2_data),
vget_high_u8(top2_data),
@@ -776,7 +776,7 @@
vget_high_u8(bot2_data)
};
- uint8x8_t p[21];
+ std::array<uint8x8_t, 21> p{ 0 };
p[0] = d[0];
p[1] = vext_u8(d[0], d[1], 1);
p[2] = vext_u8(d[0], d[1], 2);
@@ -816,7 +816,7 @@
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -849,7 +849,7 @@
const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -889,7 +889,7 @@
std::array<uint8_t, mask_size> vals{ {} };
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
// Clear array
std::fill(std::begin(vals), std::end(vals), 0);
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 8f97e6a..674a7c8 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -506,7 +506,7 @@
const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
_func(input.ptr(), output.ptr(), input_stride);
},
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index a4f5143..fa16484 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include <arm_neon.h>
@@ -42,12 +45,9 @@
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-using namespace arm_compute;
-
namespace arm_compute
{
class Coordinates;
-} // namespace arm_compute
namespace
{
@@ -63,15 +63,29 @@
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() != DataType::QASYMM8,
+                                    "Input2 must be QASYMM8 if input1 is QASYMM8");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QASYMM8 && input2->data_type() == DataType::QASYMM8 && overflow_policy == ConvertPolicy::WRAP,
+                                    "ConvertPolicy cannot be WRAP if data type is QASYMM8");
+
+ if(output->total_size() > 0)
+ {
+ if(output->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ }
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ }
if(std::abs(scale - scale255_constant) < 0.00001f)
{
@@ -159,6 +173,34 @@
return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}
+void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
+ const QuantizationInfo &input1_qua_info, const QuantizationInfo &input2_qua_info, const QuantizationInfo &output_qua_info)
+{
+ const auto input1 = static_cast<const qasymm8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const qasymm8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<qasymm8_t *__restrict>(output_ptr);
+
+ const qasymm8x16_t input1_q = vld1q_u8(input1);
+ const qasymm8x16_t input2_q = vld1q_u8(input2);
+
+    // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+ const QuantizationInfo tmp_qua_info = QuantizationInfo(output_qua_info.scale / scale, output_qua_info.offset);
+
+ const float32x4x4_t out_f32x4x4 =
+ {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3])
+ };
+
+ const uint8x16_t result = vquantize(out_f32x4x4, tmp_qua_info);
+ vst1q_u8(output, result);
+}
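
The new QASYMM8 path dequantizes both operands to float, multiplies, and requantizes with the pixel-wise scale folded into the output quantization info (output scale divided by scale, as in tmp_qua_info above). Per element, and ignoring the 16-wide NEON vectorization, the computation is roughly the following sketch (illustrative QInfo struct, not the library's QuantizationInfo):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QInfo
    {
        float   scale;
        int32_t offset;
    };

    // Scalar model of QASYMM8 * QASYMM8 -> QASYMM8 with saturation.
    uint8_t mul_qasymm8(uint8_t a, uint8_t b, float scale, QInfo qa, QInfo qb, QInfo qo)
    {
        const float fa  = qa.scale * (static_cast<int32_t>(a) - qa.offset); // dequantize
        const float fb  = qb.scale * (static_cast<int32_t>(b) - qb.offset);
        const float res = fa * fb;

        const QInfo   tmp{ qo.scale / scale, qo.offset };                   // fold the scale
        const int32_t q = static_cast<int32_t>(std::lround(res / tmp.scale)) + tmp.offset;
        return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 255)); // quantize + clamp
    }
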
+
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
{
@@ -291,7 +333,6 @@
vst2q_s16(output, result);
}
-template <bool is_scale255, bool is_sat>
void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
{
const auto input1 = static_cast<const float *__restrict>(input1_ptr);
@@ -313,7 +354,35 @@
vst4q_f32(output, result);
}
-template <bool is_scale255, bool is_sat>
+void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
+{
+ const auto input1 = static_cast<const float *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const float *__restrict>(input2_ptr);
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ const float32x4_t a = wrapper::vloadq(input1);
+ float32x4_t b = wrapper::vloadq(input2);
+
+ using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
+
+ const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+
+ float32x4_t res = wrapper::vmul(tmp0, b);
+
+ b = wrapper::vrev64(b);
+ b = wrapper::vmul(b, mask);
+
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output, res);
+}
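
c_mul_F32_F32_F32_n above multiplies pairs of complex numbers stored interleaved as (real, imag): it broadcasts the real and imaginary lanes of the first operand, then uses a reversed, sign-masked copy of the second so that one multiply plus one multiply-accumulate produces (ac - bd, ad + bc). The scalar identity it vectorizes:

    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
    // Operands and result are laid out as interleaved { real, imag } float pairs.
    void c_mul_scalar(const float *in1, const float *in2, float *out)
    {
        const float a = in1[0], b = in1[1];
        const float c = in2[0], d = in2[1];
        out[0] = a * c - b * d; // real part
        out[1] = a * d + b * c; // imaginary part
    }

In the NEON version, tmp0/tmp1 hold the broadcast a and b lanes, vrev64 swaps c and d, and the { -1, 1 } mask supplies the sign flip on the real-part term.
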
+
void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -419,7 +488,7 @@
} // namespace
NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
- : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+ : _func_float(nullptr), _func_int(nullptr), _func_qasymm8(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}
@@ -439,6 +508,7 @@
_output = output;
_scale = scale;
_scale_exponent = 0;
+ _func_qasymm8 = nullptr;
_func_int = nullptr;
_func_float = nullptr;
@@ -464,7 +534,11 @@
const DataType dt_output = output->info()->data_type();
const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
- if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8)
+ {
+ _func_qasymm8 = &mul_saturate_QASYMM8_QASYMM8_QASYMM8_n;
+ }
+ else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
{
if(is_scale_255)
{
@@ -521,12 +595,12 @@
}
else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
{
- _func_float = &mul_F16_F16_F16_n<false, false>;
+ _func_float = &mul_F16_F16_F16_n;
_func_int = nullptr;
}
else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
{
- _func_float = &mul_F32_F32_F32_n<false, false>;
+ _func_float = &mul_F32_F32_F32_n;
_func_int = nullptr;
}
else
@@ -581,9 +655,20 @@
Iterator input2(_input2, slice_input2);
Iterator output(_output, slice);
- if(_func_int != nullptr)
+ if(_func_qasymm8 != nullptr)
{
- execute_window_loop(collapsed, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates &)
+ {
+ (*_func_qasymm8)(input1.ptr(), input2.ptr(), output.ptr(), _scale,
+ _input1->info()->quantization_info(), _input2->info()->quantization_info(), _output->info()->quantization_info());
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
+ },
+ input1, input2, output);
+ }
+ else if(_func_int != nullptr)
+ {
+ execute_window_loop(collapsed, [&](const Coordinates &)
{
(*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
collapsed.slide_window_slice_3D(slice_input1);
@@ -594,7 +679,7 @@
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- execute_window_loop(collapsed, [&](const Coordinates & id)
+ execute_window_loop(collapsed, [&](const Coordinates &)
{
(*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
collapsed.slide_window_slice_3D(slice_input1);
@@ -608,5 +693,113 @@
{
const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
+ return BorderSize{ 0, border, 0, 0 };
}
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
+
+Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
+
+ // Auto initialize output if not initialized
+ const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
+ auto_init_if_empty(*output, out_info);
+
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
+ Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ INEKernel::configure(win_config.second);
+}
+
+Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
+ Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
+
+BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
+{
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
+ return { 0, border, 0, 0 };
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index d00a4af..ac2ffa1 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -138,7 +138,6 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
|| (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
}
@@ -353,7 +352,8 @@
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));
// Check output dimensions
- unsigned int pooled_w, pooled_h;
+ unsigned int pooled_w;
+ unsigned int pooled_h;
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
input->info()->dimension(idx_height),
pool_size.x(),
@@ -640,6 +640,15 @@
}
}
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+ if(input_qinfo != output_qinfo)
+ {
+ const auto requantized_output = vquantize(vdequantize(vcombine_u8(lower_res, upper_res), input_qinfo), output_qinfo);
+ lower_res = vget_low_u8(requantized_output);
+ upper_res = vget_high_u8(requantized_output);
+ }
+
// Store result
if(pool_stride_x == 1)
{
@@ -805,6 +814,9 @@
const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+
const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
@@ -814,6 +826,8 @@
const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+ uint8x8_t fres = {};
+ uint8x16_t fqres = {};
if(pooling_type == PoolingType::AVG)
{
@@ -869,7 +883,7 @@
scale_vector_s16x8(exclude_padding, res, id, 0, 1,
pool_size, upper_bound_w, upper_bound_h,
pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
+ fres = vmovn_u16(res);
}
else
{
@@ -881,8 +895,7 @@
scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1,
pool_size, upper_bound_w, upper_bound_h,
pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
- vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+ fqres = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
}
}
else
@@ -896,14 +909,31 @@
{
const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- const uint8x8_t res = vtbl2_u8(table, lookup_val);
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+ fres = vtbl2_u8(table, lookup_val);
}
else
{
- vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
+ fqres = final_max;
}
}
+
+ // Store result
+ if(pool_stride_x == 1)
+ {
+ if(input_qinfo != output_qinfo)
+ {
+ fqres = vquantize(vdequantize(fqres, input_qinfo), output_qinfo);
+ }
+ vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), fqres);
+ }
+ else
+ {
+ if(input_qinfo != output_qinfo)
+ {
+ fres = vquantize(vdequantize(fres, input_qinfo), output_qinfo);
+ }
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), fres);
+ }
},
input, output);
}
@@ -1641,6 +1671,11 @@
}
// Store result
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
+ res = (input_qinfo != output_qinfo) ? sqcvt_qasymm8_f32(scvt_f32_qasymm8(res, input_qinfo.scale, input_qinfo.offset), output_qinfo.scale,
+ output_qinfo.offset) :
+ res;
*(reinterpret_cast<uint8_t *>(output.ptr())) = res;
},
input, output);
@@ -1663,7 +1698,9 @@
const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
- const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+ const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+ const QuantizationInfo &input_qinfo = _input->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = _output->info()->quantization_info();
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -1713,6 +1750,12 @@
uint8x8_t res1 = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
uint8x8_t res2 = vmovn_u16(vcombine_u16(vmovn_u32(vres3), vmovn_u32(vres4)));
+ if(input_qinfo != output_qinfo)
+ {
+ const auto requantized_output = vquantize(vdequantize(vcombine_u8(res1, res2), input_qinfo), output_qinfo);
+ res1 = vget_low_u8(requantized_output);
+ res2 = vget_high_u8(requantized_output);
+ }
// Store result
vst1_u8(output.ptr(), res1);
@@ -1733,7 +1776,7 @@
}
// Store result
- vst1q_u8(output.ptr(), vres);
+ vst1q_u8(output.ptr(), (input_qinfo != output_qinfo) ? vquantize(vdequantize(vres, input_qinfo), output_qinfo) : vres);
}
},
input, output);
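
The requantization added to the pooling paths above only takes effect when the input and output QuantizationInfo differ: the pooled QASYMM8 value is mapped back to real numbers with the input scale/offset and re-encoded with the output scale/offset. A scalar sketch of that round trip, assuming the usual asymmetric uint8 scheme (the QInfo struct and function name are made up for illustration):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo
{
    float scale;
    int   offset;
};

uint8_t requantize(uint8_t value, const QInfo &in, const QInfo &out)
{
    const float real = in.scale * (static_cast<int>(value) - in.offset);     // dequantize
    const long  q    = std::lround(real / out.scale) + out.offset;           // quantize
    return static_cast<uint8_t>(std::min<long>(255, std::max<long>(0, q)));  // saturate to uint8
}
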
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index b49400a..4deeb1c 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,152 +23,140 @@
*/
#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/CPP/Validate.h"
+
#include <arm_neon.h>
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
- if(output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+inline float32x4x4_t load_value(const float *input_ptr)
{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
-
- constexpr unsigned int num_elems_processed_per_iteration = 8;
-
- // Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowStatic min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
-
- // Update window and padding
- bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
-
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win);
+ return { wrapper::vloadq(input_ptr),
+ wrapper::vloadq(input_ptr + 4),
+ wrapper::vloadq(input_ptr + 8),
+ wrapper::vloadq(input_ptr + 12) };
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline const float32x4x4_t load_value(const float16_t *input_ptr)
+{
+ return { vcvt_f32_f16(wrapper::vload(input_ptr)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+}
+
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
} // namespace
NEQuantizationLayerKernel::NEQuantizationLayerKernel()
- : _input(nullptr), _output(nullptr), _min_max(nullptr)
+ : _input(nullptr), _output(nullptr)
{
}
-void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
- _input = input;
- _output = output;
- _min_max = min_max;
+ _input = input;
+ _output = output;
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
+ Window win_config = calculate_max_window(*input->info(), Steps());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
- INEKernel::configure(std::get<1>(win_config));
+ INEKernel::configure(win_config);
}
-Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
return Status{};
}
+template <typename T>
+void NEQuantizationLayerKernel::quantize(const Window &window, const QuantizationInfo &qinfo)
+{
+ constexpr auto window_step = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize(load_value(&input_ptr[x]), qinfo));
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ output_ptr[x] = qinfo.quantize(input_ptr[x], rounding_policy);
+ }
+ },
+ input, output);
+}
+
void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window window_input_output(window);
- window_input_output.set(3, Window::Dimension(0, 1, 1));
+ const QuantizationInfo &qinfo = _output->info()->quantization_info();
- Window window_min_max;
- window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
- window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, window_input_output);
- Iterator output(_output, window_input_output);
- Iterator min_max(_min_max, window_min_max);
-
- execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ switch(_input->info()->data_type())
{
- // Get the min and max
- float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
- float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
-
- // Saturate the result if min = max
- if(min == max)
- {
- min = 0.0f;
- max = 1.0f;
- }
-
- const float32x4_t vmin = vdupq_n_f32(min);
- const float32x4_t inv_range = vdupq_n_f32(1.0f / (max - min));
- const float32x4_t quantization_max = vdupq_n_f32(255.0f);
- const float32x4_t quantization_mul = vdupq_n_f32(256.0f);
-
- // Uniformly map values to range 8bit integers, i.e. [min, max] -> [0, 255]
- execute_window_loop(window_input_output, [&](const Coordinates & id)
- {
- // Get the input values
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
- float32x4x2_t val = vld2q_f32(input_ptr);
-
- // Map float values to range [0.0, 1.0]
- val.val[0] = vsubq_f32(val.val[0], vmin);
- val.val[1] = vsubq_f32(val.val[1], vmin);
- val.val[0] = vmulq_f32(val.val[0], inv_range);
- val.val[1] = vmulq_f32(val.val[1], inv_range);
-
- // Quantize
- val.val[0] = vmulq_f32(val.val[0], quantization_mul);
- val.val[1] = vmulq_f32(val.val[1], quantization_mul);
- val.val[0] = vminq_f32(val.val[0], quantization_max);
- val.val[1] = vminq_f32(val.val[1], quantization_max);
-
- const uint32x4_t val_u32_low = vcvtq_u32_f32(val.val[0]);
- const uint32x4_t val_u32_high = vcvtq_u32_f32(val.val[1]);
- const uint16x4x2_t val_u16 = vzip_u16(vmovn_u32(val_u32_low), vmovn_u32(val_u32_high));
-
- const uint8x8_t quantized = vmovn_u16(vcombine_u16(val_u16.val[0], val_u16.val[1]));
-
- // Store the quantized values
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
- vst1_u8(output_ptr, quantized);
- },
- input, output);
- },
- min_max);
+ case DataType::F32:
+ NEQuantizationLayerKernel::quantize<float>(window, qinfo);
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ NEQuantizationLayerKernel::quantize<float16_t>(window, qinfo);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
}
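
The rewritten NEQuantizationLayerKernel above processes 16 elements per vector iteration and finishes each row with a scalar tail. A plain scalar sketch of that loop structure, assuming the standard QASYMM8 mapping q = round(x / scale) + offset; quantize_scalar and quantize_row are hypothetical helpers, and the rounding below is plain nearest rather than the per-architecture TO_NEAREST_EVEN / TO_ZERO choice the kernel makes.

#include <cmath>
#include <cstdint>

uint8_t quantize_scalar(float v, float scale, int offset)
{
    int q = static_cast<int>(std::lround(v / scale)) + offset;
    q = q < 0 ? 0 : (q > 255 ? 255 : q); // saturate to the uint8 range
    return static_cast<uint8_t>(q);
}

void quantize_row(const float *in, uint8_t *out, int width, float scale, int offset)
{
    constexpr int step = 16; // elements handled per "vector" iteration
    int x = 0;
    for(; x <= width - step; x += step) // main loop over whole blocks of 16
    {
        for(int i = 0; i < step; ++i)
        {
            out[x + i] = quantize_scalar(in[x + i], scale, offset);
        }
    }
    for(; x < width; ++x) // left-over elements
    {
        out[x] = quantize_scalar(in[x], scale, offset);
    }
}
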
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 84cb223..aa20d1f 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -602,7 +602,7 @@
{
ARM_COMPUTE_UNUSED(out_slice);
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ execute_window_loop(in_slice, [&](const Coordinates &)
{
neon_vector vec_res_value = { 0 };
if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
@@ -688,13 +688,70 @@
}
};
+template <typename T, int S, int axis, ReductionOperation op>
+struct RedOpYZW_complex
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int, const ReductionOperation)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ ARM_COMPUTE_ERROR_ON(axis != 2);
+
+ const size_t stride_z = in_info.strides_in_bytes()[axis];
+
+ execute_window_loop(in_slice, [&](const Coordinates &)
+ {
+ neon_vector vec_res_value_0 = { 0 };
+ neon_vector vec_res_value_1 = { 0 };
+
+ vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr_0;
+ T *in_ptr_1;
+ switch(axis)
+ {
+ case 2:
+ in_ptr_0 = reinterpret_cast<T *>(input.ptr() + stride_z * dim);
+ in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 16 + stride_z * dim);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+ const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value_0);
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + 16), vec_res_value_1);
+
+ },
+ input, output);
+ }
+};
+
struct RedOpYZW_qasymm8
{
inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ execute_window_loop(in_slice, [&](const Coordinates &)
{
uint32x4x4_t vec_res_idx{ { 0 } };
auto vec_res_value1 = vdupq_n_u32(0);
@@ -848,6 +905,31 @@
void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
+ const bool is_complex = (input->info()->num_channels() == 2);
+
+ if(is_complex)
+ {
+ switch(axis)
+ {
+ case 2:
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ }
+
switch(axis)
{
case 0:
@@ -917,7 +999,17 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+
+ if(input->num_channels() == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(op != ReductionOperation::SUM);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
@@ -929,12 +1021,12 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
@@ -952,7 +1044,7 @@
// Output auto initialization if not yet initialized
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
- auto_init_if_empty(*output, output_shape, 1, output_data_type, input->quantization_info());
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
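
The RedOpYZW_complex reducer introduced above sums a two-channel (complex) F32 tensor along the Z axis, accumulating the real and imaginary lanes independently. A scalar sketch of the same reduction over a flat buffer of interleaved (re, im) pairs; the layout and names are illustrative only.

#include <cstddef>
#include <vector>

// in:  depth * num_complex interleaved (re, im) pairs, depth-major
// out: num_complex interleaved (re, im) pairs holding the per-position sums
void reduce_sum_complex(const std::vector<float> &in, std::vector<float> &out,
                        std::size_t num_complex, std::size_t depth)
{
    out.assign(2 * num_complex, 0.0f);
    for(std::size_t d = 0; d < depth; ++d)
    {
        for(std::size_t i = 0; i < num_complex; ++i)
        {
            out[2 * i]     += in[d * 2 * num_complex + 2 * i];     // real accumulation
            out[2 * i + 1] += in[d * 2 * num_complex + 2 * i + 1]; // imaginary accumulation
        }
    }
}
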
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index edb3ffe..3c871de 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -113,8 +113,8 @@
AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
@@ -140,7 +140,7 @@
const float32x4_t height = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
const int32x4_t in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
const auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
@@ -190,7 +190,7 @@
const size_t height = _input->info()->dimension(1);
const size_t in_stride = _input->info()->strides_in_bytes()[1];
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const auto mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
const auto mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 8baea2b..ece5aa4 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,9 +67,47 @@
}
} // namespace
-template <typename T>
-void NEReorgLayerKernel::run_reorg(const Window &window)
+NEReorgLayerKernel::NEReorgLayerKernel()
+ : _input(nullptr), _output(nullptr), _stride(1)
{
+}
+
+void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+ _input = input;
+ _output = output;
+ _stride = stride;
+
+ // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ ICPPKernel::configure(win);
+}
+
+Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+ return Status{};
+}
+
+void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
const DataLayout data_layout = _input->info()->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -101,72 +139,8 @@
map_coords.set(idx_c, c % out_c);
// Perform mapping
- *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
+ std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
},
out);
}
-
-NEReorgLayerKernel::NEReorgLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
-{
-}
-
-void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
-
- _func = nullptr;
- _input = input;
- _output = output;
- _stride = stride;
-
- switch(input->info()->element_size())
- {
- case 1:
- _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
- break;
- case 2:
- _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
- break;
- case 4:
- _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- ICPPKernel::configure(win);
-}
-
-Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
- return Status{};
-}
-
-void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- if(_func != nullptr)
- {
- (this->*_func)(window);
- }
-}
} // namespace arm_compute
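
The reorg kernel now copies each mapped element with std::memcpy of element_size bytes instead of dispatching to per-type template instantiations; for trivially copyable elements the result is identical. A minimal sketch of that idea (the helper below is hypothetical):

#include <cstddef>
#include <cstring>

// One byte-wise copy replaces the former uint8_t/uint16_t/uint32_t specialisations.
void copy_element(void *dst, const void *src, std::size_t element_size)
{
    std::memcpy(dst, src, element_size);
}
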
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 62e4882..36398cf 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -189,31 +189,21 @@
switch(_input->info()->data_type())
{
case DataType::F32:
- run_reverse<float>(window, _input, _axis, _output);
+ case DataType::U32:
+ case DataType::S32:
+ run_reverse<uint32_t>(window, _input, _axis, _output);
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- run_reverse<float16_t>(window, _input, _axis, _output);
- break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::U32:
- run_reverse<uint32_t>(window, _input, _axis, _output);
- break;
- case DataType::S32:
- run_reverse<int32_t>(window, _input, _axis, _output);
- break;
case DataType::S16:
- run_reverse<int16_t>(window, _input, _axis, _output);
- break;
case DataType::U16:
run_reverse<uint16_t>(window, _input, _axis, _output);
break;
case DataType::QASYMM8:
case DataType::U8:
- run_reverse<uint8_t>(window, _input, _axis, _output);
- break;
case DataType::S8:
- run_reverse<int8_t>(window, _input, _axis, _output);
+ run_reverse<uint8_t>(window, _input, _axis, _output);
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 3d300ef..3354039 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -45,7 +45,7 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
- BorderMode border_mode, SamplingPolicy sampling_policy)
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
@@ -53,7 +53,8 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output == input);
ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
- ARM_COMPUTE_UNUSED(border_mode);
+ ARM_COMPUTE_RETURN_ERROR_ON(!use_padding && border_mode != BorderMode::CONSTANT);
+ ARM_COMPUTE_UNUSED(constant_border_value);
const DataLayout data_layout = input->data_layout();
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)) == 0);
@@ -121,40 +122,44 @@
std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output,
InterpolationPolicy policy, bool border_undefined,
- SamplingPolicy sampling_policy, BorderSize border_size)
+ SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
{
bool window_changed{ false };
Window win{};
- const unsigned int num_elems_processed_per_iteration = (policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
+ const unsigned int num_elems_processed_per_iteration = (use_padding && policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
// Configure kernel window
win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input, 0, -border_size.top,
- ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration),
- input->tensor_shape()[1]);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- window_changed = update_window_and_padding(win, input_access, output_access);
- output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(),
- policy, sampling_policy, border_undefined));
+ if(use_padding)
+ {
+ AccessWindowStatic input_access(input, 0, -border_size.top, use_padding ? ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration) : num_elems_processed_per_iteration,
+ input->tensor_shape()[1]);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), policy, sampling_policy, border_undefined));
+ }
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
- InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+ InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
{
std::pair<Status, Window> win_config;
switch(input->data_layout())
{
case DataLayout::NCHW:
+ if(!use_padding)
+ {
+ return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{});
+ }
win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, policy, border_undefined, sampling_policy, border_size);
break;
case DataLayout::NHWC:
- win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size);
+ win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size, use_padding);
break;
default:
win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
@@ -167,6 +172,12 @@
inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c)
{
+ const int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int32_t>(window.x().start());
+ const auto window_end_x = static_cast<int32_t>(window.x().end());
+
+ window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
Iterator in(input, win_in);
Iterator out(output, window);
@@ -174,18 +185,28 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_yi = (id.z() + 0.5f) * hr;
- const int offset_row = in_yi * stride_h + id.x() * stride_c;
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()),
- wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row)));
+ const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_yi = (id.z() + 0.5f) * hr;
+ const int offset_row = in_yi * stride_h;
+ int32_t x = window_start_x;
+ for(; x < window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
+ wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c)));
+ }
+ for(; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x) =
+ *(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c));
+ }
},
in, out);
}
-template <typename T>
+template <typename T, typename ConstType>
inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
- float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+ float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h,
+ size_t stride_c, BorderMode border_mode, PixelValue constant_border_value, bool use_padding)
{
Iterator in(input, win_in);
Iterator out(output, window);
@@ -196,7 +217,15 @@
const int input_width = input->info()->dimension(1);
const int input_height = input->info()->dimension(2);
- const T *border_area = reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+ T border_value;
+ if(use_padding)
+ {
+ border_value = *reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+ }
+ else
+ {
+ border_value = static_cast<T>(constant_border_value.get<ConstType>());
+ }
auto is_valid = [](int x, int low_x, int high_x, int y, int low_y, int high_y)
{
@@ -220,14 +249,17 @@
if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size))
{
- T a00 = 0, a01 = 0, a10 = 0, a11 = 0;
+ T a00 = 0;
+ T a01 = 0;
+ T a10 = 0;
+ T a11 = 0;
if(border_mode == BorderMode::CONSTANT)
{
- a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : *border_area;
- a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : *border_area;
- a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : *border_area;
- a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : *border_area;
+ a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : border_value;
+ a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : border_value;
+ a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : border_value;
+ a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : border_value;
}
else if(border_mode == BorderMode::REPLICATE)
{
@@ -279,7 +311,7 @@
{
if(border_mode == BorderMode::CONSTANT)
{
- *reinterpret_cast<T *>(out.ptr()) = *border_area;
+ *reinterpret_cast<T *>(out.ptr()) = border_value;
}
else if(border_mode == BorderMode::REPLICATE)
{
@@ -294,7 +326,8 @@
} // namespace
NEScaleKernel::NEScaleKernel()
- : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _sampling_offset(0)
+ : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _constant_border_value(PixelValue()),
+ _sampling_offset(0), _use_padding(true)
{
}
@@ -304,31 +337,33 @@
}
void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
- ITensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
+ ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy,
+ bool use_padding)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
dx != nullptr ? dx->info() : nullptr,
dy != nullptr ? dy->info() : nullptr,
offsets != nullptr ? offsets->info() : nullptr,
output->info(),
- policy, border_mode, sampling_policy));
+ policy, border_mode, constant_border_value, sampling_policy, use_padding));
// Get data layout and width/height indices
const DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- _input = input;
- _output = output;
- _offsets = offsets;
- _dx = dx;
- _dy = dy;
- _policy = policy;
- _border_size = BorderSize(1);
- _border_mode = border_mode;
+ _input = input;
+ _output = output;
+ _offsets = offsets;
+ _dx = dx;
+ _dy = dy;
+ _policy = policy;
+ _border_size = BorderSize(1);
+ _border_mode = border_mode;
+ _constant_border_value = constant_border_value;
+ _use_padding = use_padding;
if(sampling_policy == SamplingPolicy::CENTER)
{
@@ -342,7 +377,7 @@
// Add constant border only on top in case of NHWC layout
if(data_layout == DataLayout::NHWC)
{
- _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+ _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR && use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
}
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
@@ -379,7 +414,8 @@
dy != nullptr ? dy->info() : nullptr,
offsets != nullptr ? offsets->info() : nullptr,
output->info(),
- policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size());
+ policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size(), use_padding);
+
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
@@ -904,8 +940,8 @@
}
else
{
- scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
- window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ scale_bilinear_nhwc_core<uint8_t, uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
}
break;
}
@@ -917,8 +953,8 @@
}
else
{
- scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
- window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ scale_bilinear_nhwc_core<int16_t, int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
}
break;
}
@@ -932,8 +968,8 @@
}
else
{
- scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
- window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ scale_bilinear_nhwc_core<float16_t, half>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
}
break;
}
@@ -946,8 +982,8 @@
}
else
{
- scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
- window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+ scale_bilinear_nhwc_core<float, float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
+ window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
}
break;
}
@@ -959,7 +995,7 @@
Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
- BorderMode border_mode, SamplingPolicy sampling_policy)
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
BorderSize border_size(1);
if(input->data_layout() == DataLayout::NHWC)
@@ -967,13 +1003,13 @@
border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
}
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, sampling_policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
dx != nullptr ? dx->clone().get() : nullptr,
dy != nullptr ? dy->clone().get() : nullptr,
offsets != nullptr ? offsets->clone().get() : nullptr,
output->clone().get(),
- policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size)
+ policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size, use_padding)
.first);
return Status{};
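
With use_padding set to false, the bilinear NHWC path above reads out-of-range taps from the user-supplied constant_border_value instead of from padded memory. A scalar sketch of bilinear sampling with a constant border on a plain 2-D float image; indexing and names are simplified for illustration, whereas the kernel itself walks NHWC strides.

#include <cmath>

float sample_or_border(const float *img, int width, int height, int x, int y, float border)
{
    return (x >= 0 && x < width && y >= 0 && y < height) ? img[y * width + x] : border;
}

float bilinear_constant_border(const float *img, int width, int height,
                               float fx, float fy, float border)
{
    const int   x0 = static_cast<int>(std::floor(fx));
    const int   y0 = static_cast<int>(std::floor(fy));
    const float dx = fx - x0;
    const float dy = fy - y0;

    // Out-of-range taps fall back to the constant border value.
    const float a00 = sample_or_border(img, width, height, x0,     y0,     border);
    const float a01 = sample_or_border(img, width, height, x0 + 1, y0,     border);
    const float a10 = sample_or_border(img, width, height, x0,     y0 + 1, border);
    const float a11 = sample_or_border(img, width, height, x0 + 1, y0 + 1, border);

    return a00 * (1 - dx) * (1 - dy) + a01 * dx * (1 - dy)
         + a10 * (1 - dx) * dy       + a11 * dx * dy;
}
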
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index f23c31b..3add699 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -161,7 +161,7 @@
if(_run_scharr_x && _run_scharr_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -197,7 +197,7 @@
}
else if(_run_scharr_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
@@ -232,7 +232,7 @@
}
else if(_run_scharr_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index f2697bc..c03e5f0 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -54,7 +54,7 @@
Iterator input2(in2, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win, [&](const Coordinates &)
{
auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index 5a80630..7a27203 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,7 @@
BorderSize NESobel3x3Kernel::border_size() const
{
- return BorderSize(1);
+ return BorderSize{ 1 };
}
void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -117,7 +117,7 @@
if(_run_sobel_y && _run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -181,7 +181,7 @@
}
else if(_run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
@@ -229,7 +229,7 @@
}
else if(_run_sobel_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index 30e7817..a92cfc2 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -120,7 +120,7 @@
static const int16x8_t two = vdupq_n_s16(2);
static const int16x8_t minustwo = vdupq_n_s16(-2);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -154,7 +154,7 @@
static const int16x8_t two = vdupq_n_s16(2);
static const int16x8_t minustwo = vdupq_n_s16(-2);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -180,7 +180,7 @@
static const int16x8_t six = vdupq_n_s16(6);
static const int16x8_t four = vdupq_n_s16(4);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
@@ -211,7 +211,7 @@
BorderSize NESobel5x5VertKernel::border_size() const
{
- return BorderSize(2, 0);
+ return BorderSize{ 2, 0 };
}
void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -312,7 +312,7 @@
if(_run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Convert offset from uint8_t* to uint16_t*
const size_t input_offset_high_s16 = input_x.offset() / 2;
@@ -361,7 +361,7 @@
if(_run_sobel_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
// Convert offset from uint8_t* to uint16_t*
const size_t input_offset_high_s16 = input_y.offset() / 2;
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 40a3e31..f2b42cc 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -215,7 +215,7 @@
if(_run_sobel_y && _run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr() - 3);
@@ -244,7 +244,7 @@
}
else if(_run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr() - 3);
@@ -269,7 +269,7 @@
}
else if(_run_sobel_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr() - 3);
@@ -301,7 +301,7 @@
BorderSize NESobel7x7VertKernel::border_size() const
{
- return BorderSize(3, 0);
+ return BorderSize{ 3, 0 };
}
void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
@@ -382,7 +382,7 @@
if(_run_sobel_x)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
@@ -453,7 +453,7 @@
if(_run_sobel_y)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
new file mode 100644
index 0000000..2e46b14
--- /dev/null
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(paddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const DataLayout data_layout = input->data_layout();
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.x());
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
+ : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _padding_left(), _block_shape_x(), _block_shape_y()
+{
+}
+
+void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+
+ _input = input;
+ _block_shape = block_shape;
+ _paddings = paddings;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICPPKernel::configure(win);
+}
+
+void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+
+ _input = input;
+ _output = output;
+ _block_shape_x = block_shape_x;
+ _block_shape_y = block_shape_y;
+ _padding_left = padding_left;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ INEKernel::configure(win);
+}
+
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
+ return Status{};
+}
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ return Status{};
+}
+
+void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ if(_block_shape != nullptr)
+ {
+ // Retrieve the block shapes dynamically
+ _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
+ _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
+ }
+
+ if(_paddings != nullptr)
+ {
+ const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
+ const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+ _padding_left = Size2D(pad_left_x, pad_left_y);
+ }
+ const DataLayout data_layout = _input->info()->data_layout();
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int element_size = _input->info()->element_size();
+
+ const size_t height = _input->info()->dimension(height_idx);
+ const size_t width = _input->info()->dimension(width_idx);
+ const size_t batch_size = _input->info()->dimension(3);
+
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_4D();
+
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ int batch_id = 0;
+
+ // Main loop for NCHW and NHWC
+ if(_output->info()->data_layout() == DataLayout::NCHW)
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out, [&](const Coordinates & id)
+ {
+ const size_t out_x = id.x();
+ const size_t out_y = id.y();
+ const size_t z = id.z();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{ in_x, in_y, z, w };
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_out));
+ }
+ else
+ {
+ do
+ {
+ Iterator out(_output, slice_out);
+ execute_window_loop(slice_out, [&](const Coordinates & id)
+ {
+ const size_t out_x = id.y();
+ const size_t out_y = id.z();
+ const size_t z = id.x();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{ z, in_x, in_y, w };
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_out));
+ }
+}
+} // namespace arm_compute
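Both loops above use the same coordinate arithmetic: every output batch encodes one (x, y) offset inside the block, and positions that land in the padding region are skipped so the output keeps its fill value there. A minimal standalone sketch of that mapping, with illustrative names and NCHW ordering only:

// Illustrative sketch only: map one space-to-batch output coordinate back to the
// input coordinate it is copied from (NCHW ordering). Returns false for positions
// that fall inside the padding.
struct SrcCoord
{
    int x, y, z, batch;
};

inline bool space_to_batch_source(int out_x, int out_y, int out_z, int out_batch,
                                  int block_x, int block_y, int pad_left_x, int pad_left_y,
                                  int in_width, int in_height, int in_batches, SrcCoord &src)
{
    const int block_id = out_batch / in_batches; // offset of this output batch inside the block
    const int pos_x    = out_x * block_x + block_id % block_x;
    const int pos_y    = out_y * block_y + block_id / block_x;
    if(pos_x < pad_left_x || pos_x >= pad_left_x + in_width || pos_y < pad_left_y || pos_y >= pad_left_y + in_height)
    {
        return false;
    }
    src = { pos_x - pad_left_x, pos_y - pad_left_y, out_z, out_batch % in_batches };
    return true;
}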
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 0c33f36..3447d59 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -87,7 +87,7 @@
} // namespace
NEStackLayerKernel::NEStackLayerKernel()
- : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+ : _input(nullptr), _output(nullptr), _axis(), _idx_input()
{
}
@@ -101,22 +101,6 @@
_axis = axis;
_idx_input = idx_input;
- switch(input->info()->element_size())
- {
- case 1:
- _func = &NEStackLayerKernel::run_stack<uint8_t>;
- break;
- case 2:
- _func = &NEStackLayerKernel::run_stack<uint16_t>;
- break;
- case 4:
- _func = &NEStackLayerKernel::run_stack<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
@@ -137,15 +121,6 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_func != nullptr)
- {
- (this->*_func)(window);
- }
-}
-
-template <typename T>
-void NEStackLayerKernel::run_stack(const Window &window)
-{
Window window_out;
window_out.use_tensor_dimensions(_output->info()->tensor_shape());
@@ -160,9 +135,9 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
- *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+ const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+ std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
},
input);
}
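The hunk above replaces the per-type run_stack<T> dispatch with a single std::memcpy of element_size bytes; for the 1-, 2- and 4-byte types the kernel previously special-cased, the two are equivalent. A tiny sketch of that equivalence, with illustrative names:

#include <cstdint>
#include <cstring>

// Illustrative only: a size-based byte copy behaves like the typed assignment the
// removed uint8_t/uint16_t/uint32_t specialisations performed.
inline void copy_element(std::uint8_t *dst, const std::uint8_t *src, std::size_t element_size)
{
    std::memcpy(dst, src, element_size); // element_size is 1, 2 or 4 in this kernel
}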
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index 958f4a9..536c220 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@
Iterator input = Iterator(_input, window);
Iterator output = Iterator(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
auto input_ptr = reinterpret_cast<const T *>(input.ptr());
auto output_ptr = reinterpret_cast<T *>(output.ptr());
@@ -92,7 +92,7 @@
Iterator input = Iterator(_input, window);
Iterator output = Iterator(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8_t *input_ptr = input.ptr();
uint8_t *output_ptr = output.ptr();
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 5ef0693..ae9c62b 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,7 +86,7 @@
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
const uint8x16_t mask = vcgtq_u8(data, threshold);
@@ -106,7 +106,7 @@
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(window, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(input.ptr());
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index aae85c6..d3d88b3 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -130,7 +130,7 @@
Iterator output(_output, window_out);
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
const float32x4_t data_out1 = { vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 1), vgetq_lane_f32(data, 1) };
@@ -157,7 +157,7 @@
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
auto out = reinterpret_cast<float *>(output.ptr());
@@ -182,7 +182,7 @@
Iterator output(_output, window_out);
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
const uint8x16_t data_out1 = { vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 1), vgetq_lane_u8(data, 1),
@@ -218,7 +218,7 @@
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(uint8_t);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
auto out = reinterpret_cast<uint8_t *>(output.ptr());
@@ -245,7 +245,7 @@
Iterator output(_output, window_out);
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
const float16x8_t data_out1 = { vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 1), vgetq_lane_f16(data, 1),
@@ -278,7 +278,7 @@
const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float16_t);
- execute_window_loop(window_out, [&](const Coordinates & id)
+ execute_window_loop(window_out, [&](const Coordinates &)
{
const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
auto out = reinterpret_cast<float16_t *>(output.ptr());
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4a0cf27..624833a 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,59 +34,6 @@
namespace
{
-template <typename T>
-void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
-{
- const unsigned int kernel_size_x = input->info()->dimension(0);
- const unsigned int kernel_size_y = input->info()->dimension(1);
- const unsigned int kernel_depth = input->info()->dimension(2);
- const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
- const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
- const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
- const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
-
- // Create iterators
- Iterator in(input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
- const int kernel_idz = id[4];
-
- // Setup pointers
- const uint8_t *tmp_input_ptr = in.ptr();
- uint8_t *tmp_output_ptr = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
- const uint8_t *curr_input_row_ptr = tmp_input_ptr;
- const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
- {
- for(unsigned int j = 0; j < kernel_size_y; ++j)
- {
- for(unsigned int i = 0; i < kernel_size_x; ++i)
- {
- *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
- }
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
- }
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
-
- // Add bias
- if(bias != nullptr)
- {
- *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
- }
- },
- in);
-}
-
TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
{
TensorShape output_shape{ input->tensor_shape() };
@@ -141,7 +88,7 @@
} // namespace
NEWeightsReshapeKernel::NEWeightsReshapeKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
{
}
@@ -161,30 +108,6 @@
_bias = bias;
_output = output;
- switch(_input->info()->element_size())
- {
- case 4:
- {
- _func = &weights_reshape<uint32_t>;
- break;
- }
- case 2:
- {
- _func = &weights_reshape<uint16_t>;
- break;
- }
- case 1:
- {
- _func = &weights_reshape<uint8_t>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR_ON("Element size not supported");
- break;
- }
- }
-
// Configure kernel
auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -205,5 +128,52 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- (*_func)(_input, _bias, _output, window);
+ const unsigned int kernel_size_x = _input->info()->dimension(0);
+ const unsigned int kernel_size_y = _input->info()->dimension(1);
+ const unsigned int kernel_depth = _input->info()->dimension(2);
+ const unsigned int input_stride_x = _input->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = _input->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = _input->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(_input, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for(unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for(unsigned int j = 0; j < kernel_size_y; ++j)
+ {
+ for(unsigned int i = 0; i < kernel_size_x; ++i)
+ {
+ std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if(_bias != nullptr)
+ {
+ std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
+ }
+ },
+ in);
}
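The loop above linearizes each kx-by-ky-by-depth kernel into one column of the reshaped matrix, walking x fastest, then y, then depth, and appending the bias as the last row when present. The destination row index it effectively produces can be written as a small helper (illustrative only, not part of the patch):

// Illustrative only: row index inside the reshaped-weights column for element (i, j, d),
// matching the x -> y -> depth walk order above.
inline unsigned int reshaped_row(unsigned int i, unsigned int j, unsigned int d,
                                 unsigned int kernel_size_x, unsigned int kernel_size_y)
{
    // when a bias is present it occupies row kernel_depth * kernel_size_y * kernel_size_x
    return d * kernel_size_y * kernel_size_x + j * kernel_size_x + i;
}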
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index 3e76a08..263ded0 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -238,8 +238,7 @@
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
- : _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0)
-
+ : _transform(nullptr), _weights_hwio(nullptr), _output(nullptr), _matrix_stride(0), _num_output_channels(0), _num_input_channels(0)
{
}
@@ -263,11 +262,10 @@
_matrix_stride = matrix_stride;
_num_output_channels = num_output_channels;
_num_input_channels = num_input_channels;
+ _transform = arm_compute::support::cpp14::make_unique<WeightsTransform>(num_output_channels, num_input_channels);
- const int matrix_row_stride = roundup(num_output_channels, WinogradConv::N_BLOCK);
- WeightsTransform transform(nullptr, nullptr, matrix_stride, matrix_row_stride, num_output_channels, num_input_channels);
- Window win;
- auto win_last = transform.get_window();
+ Window win;
+ auto win_last = _transform->get_window();
win.set(Window::DimX, Window::Dimension(0, win_last, 1));
INEKernel::configure(win);
}
@@ -278,12 +276,14 @@
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ const size_t fst = window.x().start();
+ const size_t lst = window.x().end();
+ _transform->set_weight_tensor(_weights_hwio->buffer());
+ const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK);
+ _transform->set_output_matrices(_output->buffer(), _matrix_stride, matrix_row_stride);
+ _transform->set_working_space(_output->buffer());
- const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK);
- WeightsTransform transform(reinterpret_cast<T *>(_weights_hwio->buffer()), reinterpret_cast<T *>(_output->buffer()), _matrix_stride, matrix_row_stride, _num_output_channels, _num_input_channels);
- const size_t fst = window.x().start();
- const size_t lst = window.x().end();
- transform.run(fst, lst);
+ _transform->run(fst, lst);
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -331,6 +331,12 @@
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
+{
+ return _transform->get_working_space_size(num_threads) / sizeof(T);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
{
@@ -339,7 +345,8 @@
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
- : _input_nhwc(), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0)
+ : _transform(nullptr), _input_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0), _padding(), _output(nullptr), _matrix_stride(0), _padding_top(), _padding_left(),
+ _padding_right(), _padding_bottom(), _workspace(nullptr)
{
}
@@ -352,7 +359,8 @@
const int num_channels, /* Number of channels in input tensor. */
const PaddingType padding, /* Padding type. */
ITensor *output, /* Base of output matrices. */
- const int matrix_stride) /* Stride between output matrices. */
+ const int matrix_stride, /* Stride between output matrices. */
+ ITensor *workspace)
{
_input_nhwc = input_nhwc;
_num_batches = num_batches;
@@ -362,9 +370,28 @@
_padding = padding;
_output = output;
_matrix_stride = matrix_stride;
- InputTransform transform(nullptr, num_batches, num_rows, num_cols, num_channels, padding, nullptr, matrix_stride, num_channels);
- Window win;
- auto win_last = transform.get_window();
+ _workspace = workspace;
+
+ _padding_top = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0;
+ _padding_left = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0;
+ _padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0;
+ _padding_right = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0;
+
+ _transform = arm_compute::support::cpp14::make_unique<InputTransform>(
+ KernelRows,
+ KernelCols,
+ num_batches,
+ num_rows,
+ num_cols,
+ num_channels,
+ _padding_top, /**< Padding to apply to the top of the image. */
+ _padding_left, /**< Padding to apply to the left of the image. */
+ _padding_bottom, /**< Padding to apply to the bottom of the image. */
+ _padding_right /**< Padding to apply to the right of the image. */
+ );
+
+ Window win;
+ auto win_last = _transform->get_window();
win.set(Window::DimX, Window::Dimension(0, win_last, 1));
INEKernel::configure(win);
}
@@ -374,22 +401,25 @@
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
- const int element_size_in_bytes = _input_nhwc->info()->element_size();
- const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
- const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
- const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
- const auto input_nhwc_ptr = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
- auto output_ptr = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
- InputTransform input_transform(input_nhwc_ptr,
- _num_batches, _num_rows, _num_cols, _num_channels, _padding,
- output_ptr,
- _matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
+ const int element_size_in_bytes = _input_nhwc->info()->element_size();
+ const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+ const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+ const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+ const auto input_nhwc_ptr = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
+ auto output_ptr = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr);
+
+ _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride);
+ _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels);
+
+ _transform->set_working_space(_workspace->buffer());
     // The code below cannot be moved to configure because the tensor buffers haven't been allocated at that point
const size_t fst = window.x().start();
const size_t lst = window.x().end();
- input_transform.run(fst, lst);
+ _transform->run(fst, lst, info.thread_id);
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -435,11 +465,18 @@
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
- : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0), _num_cols(0), _num_channels(0)
+ : _transform(nullptr), _biases(nullptr), _transformed_output(nullptr), _workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output_nhwc(nullptr), _num_batches(0), _num_rows(0),
+ _num_cols(0), _num_channels(0)
{
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
+{
+ return _transform->get_working_space_size(num_threads) / sizeof(T);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
{
@@ -455,28 +492,29 @@
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
const ITensor *biases,
- const ITensor *output_workingspace,
+ const ITensor *transformed_output,
const int matrix_stride,
ITensor *output_nhwc,
const int num_batches,
const int num_rows,
const int num_cols,
- const int num_channels)
+ const int num_channels,
+ ITensor *workspace)
{
- _biases = biases;
- _output_workspace = output_workingspace;
- _matrix_stride = matrix_stride;
- _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK);
- _output_nhwc = output_nhwc;
- _num_batches = num_batches;
- _num_rows = num_rows;
- _num_cols = num_cols;
- _num_channels = num_channels;
+ _biases = biases;
+ _workspace = workspace;
+ _transformed_output = transformed_output;
+ _matrix_stride = matrix_stride;
+ _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK);
+ _output_nhwc = output_nhwc;
+ _num_batches = num_batches;
+ _num_rows = num_rows;
+ _num_cols = num_cols;
+ _num_channels = num_channels;
     // We don't have the biases buffer at this stage as it hasn't been allocated; we pass in nullptr. OutputTransform is only used here to compute the window
- OutputTransform output_transform(nullptr, _matrix_stride, _matrix_row_stride, nullptr, nullptr, _num_batches, _num_rows, _num_cols, _num_channels);
-
+ _transform = arm_compute::support::cpp14::make_unique<OutputTransform>(num_batches, num_rows, num_cols, num_channels);
Window win;
- auto win_last = output_transform.get_window();
+ auto win_last = _transform->get_window();
win.set(Window::DimX, Window::Dimension(0, win_last, 1));
_output_nhwc->info()->set_valid_region(ValidRegion(Coordinates(), _output_nhwc->info()->tensor_shape()));
@@ -488,22 +526,22 @@
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_workspace);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_transformed_output);
ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
- const int out_batch_stride = 0;
+ const int out_batch_stride = _output_nhwc->info()->strides_in_bytes()[3] / sizeof(T);
const int out_row_stride = _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T);
const int out_col_stride = _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T);
- OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
- (_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
- reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
- _num_batches, _num_rows, _num_cols, _num_channels, out_batch_stride, out_row_stride, out_col_stride);
-
+ _transform->set_input_matrices(_transformed_output->buffer(), _matrix_stride, _matrix_row_stride);
+ _transform->set_bias((_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr));
+ _transform->set_output_tensor(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride);
+ _transform->set_working_space(_workspace->buffer());
// The code below cannot be moved to configure because biases hasn't been allocated at that point
const size_t fst = window.x().start();
const size_t lst = window.x().end();
- output_transform.run(fst, lst);
+ _transform->run(fst, lst, info.thread_id);
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
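The pattern repeated in the three hunks above is the same: configure() now builds the transform object once with make_unique and only sizes the window from it, while run() binds the tensor buffers (which do not exist yet at configure time) through set_* calls and then executes its window sub-range. Roughly, and with a hypothetical Transform type standing in for the real transform classes:

#include <memory>

// Illustrative only: the configure-once / bind-buffers-per-run split used above.
struct Transform
{
    Transform(int rows, int cols) : _rows(rows), _cols(cols) {}
    int  get_window() const { return _rows; } // total amount of parallelisable work
    void set_buffers(const float *in, float *out) { _in = in; _out = out; }
    void run(int start, int end, int thread_id) { (void)start; (void)end; (void)thread_id; /* process [start, end) */ }

    int          _rows, _cols;
    const float *_in  = nullptr;
    float       *_out = nullptr;
};

struct Kernel
{
    void configure(int rows, int cols) // buffers are not allocated yet at this point
    {
        _transform = std::make_unique<Transform>(rows, cols);
    }
    void run(const float *in, float *out, int start, int end, int thread_id)
    {
        _transform->set_buffers(in, out); // bind buffers only once they exist
        _transform->run(start, end, thread_id);
    }
    std::unique_ptr<Transform> _transform;
};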
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b561659..0927123 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -28,49 +28,76 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
#include "kernels/a64_hgemm_24x8.hpp"
#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp16_mla_4VLx4.hpp"
namespace arm_gemm {
static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
#if defined(__ARM_FEATURE_SVE)
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp16_mla_4VLx4",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<__fp16> &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_fp16_mla_4VLx4",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<__fp16> &args) { return new GemmNative<native_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_fp16_mla_3VLx8",
[](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
},
#endif
+
#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
{
GemmMethod::GEMM_INTERLEAVED,
"hgemm_24x8",
- [](const GemmArgs<__fp16> &args) {
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- return args._ci->has_fp16();
+ [](const GemmArgs<__fp16> &args) { return args._ci->has_fp16(); },
#else
- return true;
+ nullptr,
#endif
- },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
},
#endif
-#if defined(__arm__)
+#ifdef __aarch64__
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+},
+#elif defined(__arm__)
{
GemmMethod::GEMM_INTERLEAVED,
"sgemm_8x6",
- [](const GemmArgs<__fp16> &args) { return true; },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
},
+#else // not AArch64 or AArch32
+# error Unknown Architecture
#endif
{
GemmMethod::DEFAULT,
@@ -90,8 +117,8 @@
template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
-template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS
\ No newline at end of file
+#endif // __ARM_FP16_ARGS
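Each entry in the method tables above carries three callables: an is_supported predicate, an is_recommended hint, and a factory; replacing `[](...) { return true; }` with nullptr relies on a null predicate being treated as "no constraint" by the selection code. A simplified selection loop under that assumption, with illustrative names only:

#include <functional>
#include <vector>

// Illustrative only: how a nullptr predicate can stand for "always true" when
// walking an implementation table like the ones above.
struct Args { int Ksize; };

struct Impl
{
    const char                       *name;
    std::function<bool(const Args &)> is_supported;   // nullptr => no constraint
    std::function<bool(const Args &)> is_recommended; // nullptr => always a good choice
};

inline const Impl *select(const std::vector<Impl> &impls, const Args &args)
{
    const Impl *fallback = nullptr;
    for(const auto &i : impls)
    {
        if(i.is_supported && !i.is_supported(args))
        {
            continue; // hard requirement not met
        }
        if(!i.is_recommended || i.is_recommended(args))
        {
            return &i; // supported and recommended (or unconditionally fine)
        }
        if(fallback == nullptr)
        {
            fallback = &i; // remember the first merely-supported candidate
        }
    }
    return fallback;
}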
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 8bc33cc..6869279 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -32,6 +32,7 @@
#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a64_sgemm_native_16x4.hpp"
#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
@@ -112,6 +113,13 @@
[](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
},
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp32_mla_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"sgemm_native_16x4",
[](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
@@ -165,6 +173,6 @@
template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
-template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
+template std::vector<KernelDescription> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
-} // namespace arm_gemm
\ No newline at end of file
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c2bd0bb..82e0625 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -208,7 +208,6 @@
return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index bf80784..d952140 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -112,8 +112,12 @@
}
template<typename Top, typename Tret>
-std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
- std::vector<std::string> res;
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs<Tret> &args) {
+ std::vector<KernelDescription> res;
+
+    /* Find out what the default implementation is, so we can set the flag accordingly later. */
+    const GemmImplementation<Top, Tret> *default_impl = nullptr;
+ find_implementation(args, default_impl);
auto gemms = gemm_implementation_list<Top, Tret>();
@@ -123,7 +127,7 @@
continue;
}
- res.push_back(i->name);
+ res.push_back(KernelDescription(i->method, i->name, i==default_impl));
}
return res;
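get_compatible_kernels() now returns full KernelDescription entries rather than bare names, and it resolves the default implementation first so each entry can be flagged as the one the heuristics would pick. A minimal sketch of how a caller might consume that; the field names below mirror the three constructor arguments used above but are an assumption about the real KernelDescription layout:

#include <iostream>
#include <string>
#include <vector>

// Illustrative only: a caller-side view of the richer return type.
struct KernelDescriptionSketch
{
    int         method; // stand-in for the GemmMethod enum
    std::string name;
    bool        is_default;
};

inline void print_kernels(const std::vector<KernelDescriptionSketch> &kernels)
{
    for(const auto &k : kernels)
    {
        std::cout << k.name << (k.is_default ? " (default)" : "") << "\n";
    }
}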
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b4503dd..0db0654 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -58,7 +58,7 @@
template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 5811c2a..9e49df1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
@@ -42,6 +43,13 @@
static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef __ARM_FEATURE_SVE
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_s8s32_dot_4VLx4",
+ [](const GemmArgs<int32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"native_s8s32_dot_4VLx4",
[](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
{
GemmMethod::GEMM_HYBRID,
"hybrid_s8s32_dot_16x4",
- [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
[](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
[](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
},
@@ -95,7 +103,7 @@
template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index b83ccd3..a773166 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -480,7 +480,6 @@
return total;
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
blockwalker current(*this);
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 6bcbca9..9e3e4e4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -58,7 +58,7 @@
template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b95ca80..9321bfc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
@@ -42,6 +43,13 @@
static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef __ARM_FEATURE_SVE
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_u8u32_dot_4VLx4",
+ [](const GemmArgs<uint32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"native_u8u32_dot_4VLx4",
[](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
{
GemmMethod::GEMM_HYBRID,
"hybrid_u8u32_dot_16x4",
- [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
[](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
[](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
},
@@ -95,7 +103,7 @@
template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 32d668f..b7f9de8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -44,10 +44,9 @@
_subgemm = gemm<To,Tr>(newargs);
}
- using GemmCommon<To, Tr>::set_arrays;
void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
const To *B, const int ldb, const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
+ Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
/* A and C's batch stride becomes their new row stride. New batch stride is 0 as nbatches for subgemm is always 1. */
_subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
B, ldb, B_multi_stride,
@@ -86,7 +85,6 @@
return _subgemm->get_B_pretransposed_array_size();
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
_subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index f7beb0a..21f8278 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -148,7 +148,6 @@
return _buffer_per_multi * _nmultis * sizeof(To);
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
new file mode 100644
index 0000000..5605939
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, float, int, int, int);
+void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_16x4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32_mla_16x4;
+
+ hybrid_fp32_mla_16x4(const CPUInfo *ci)
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) {
+ kernel = a64_hybrid_fp32_mla_16x4_a55;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
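The strategy header above exposes the kernel's blocking parameters (4 output rows by 16 output columns per tile, no K unrolling) and swaps in the Cortex-A55 variant at construction time based on the CPU model. The tile count a caller derives from those parameters looks roughly like this (illustrative helper, not part of the strategy API):

// Illustrative only: number of 16x4 tiles needed to cover an M x N output,
// using the out_height()/out_width() blocking advertised by the strategy above.
inline unsigned int round_up_div(unsigned int a, unsigned int b)
{
    return (a + b - 1) / b;
}

inline unsigned int tiles_for(unsigned int M, unsigned int N,
                              unsigned int out_height = 4, unsigned int out_width = 16)
{
    return round_up_div(M, out_height) * round_up_div(N, out_width);
}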
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
new file mode 100644
index 0000000..7261761
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -0,0 +1,2352 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long blocks_count = K / 1;
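+    // Worked example of the K decomposition above (illustrative, assuming K = 35):
+    //   loops_count  = ((35 + 4) / 8) - 1 = 3   -> three main-loop iterations of 8 elements along K
+    //   K            = 35 - 3*8            = 11
+    //   regs_count   = (11 / 4) - 1        = 1   -> regs_count + 1 = 2 four-wide chunks
+    //   K            = 11 - 2*4            = 3
+    //   blocks_count = 3                         -> three single-element tail iterations
+    // i.e. 35 = 3*8 + 2*4 + 3, matching the loop structure in the assembly below.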
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+            const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
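+            // When the remaining width is less than a full 16-column tile, the kernel
+            // accumulates into a dense 16-wide staging buffer instead of C directly;
+            // if beta != 0 the live C values are copied in first so the beta-scaled
+            // accumulation still sees them.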
+ const bool use_result_buffer = (width < 16);
+ float result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+ float *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "fmul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
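+    // When the output tile is narrower than 16 columns, copy the valid elements back from the staging buffer into C.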
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
new file mode 100644
index 0000000..504769b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -0,0 +1,1726 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
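+    // Hybrid FP32 GEMM kernel: computes C = beta * C + A * B on tiles of up to 4 rows by 16 columns,
+    // with the inner K dimension accumulated by the inline assembly below.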
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long blocks_count = K / 1;
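+    // K is consumed as 'loops_count' passes of the 8-deep unrolled main loop, then a tail of 8 or 4
+    // elements selected by 'regs_count', then 'blocks_count' single leftover elements.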
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+      const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
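+      // Tiles narrower than 16 columns are computed into a stack-side staging buffer so the assembly
+      // can always store full 16-column rows; the valid columns are copied back to C afterwards.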
+ const bool use_result_buffer = (width < 16);
+ float result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+ float *c_ptr_real = c_ptr0;
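+      // When accumulating (beta != 0), pre-load the valid part of C into the staging buffer so the
+      // beta scaling inside the assembly reads the correct values.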
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
+
+ switch(M-y) {
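+        // Dispatch on the number of rows remaining in this block; case 4 (also the default) handles a full 4-row tile.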
+ case 1:
+ __asm __volatile (
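+          // Label layout (shared by the cases below): 1 = load-and-scale C when beta != 0, 3 = unrolled
+          // main K loop, 2 = loop exit, 4 = 4-element tail when 'regs' is zero, 5 = tail join,
+          // 7 = single-element leftover loop, 6 = store of the accumulators.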
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
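+          // The extra row pointers (a_ptr1/c_ptr1) are x-register aliases created with .req and are
+          // derived from lda/ldc inside the assembly.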
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "fmul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
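
The next file's diff (a64_hybrid_s8s32_dot_16x4/a55.cpp) extends the driver code so that, besides the main `loops` and `regs` passes, the leftover K elements are split into `blocks` of 4 (one sdot lane each) and 1-3 `odds` bytes loaded lane by lane. As a rough illustration only, not part of the patch, the decomposition can be sketched as below; `split_k` is a hypothetical helper name and the sketch assumes K is at least 16, matching the kernel's 16-byte interleave.

```cpp
// Illustrative sketch (not part of the patch): how the updated hybrid
// int8 kernels split K into main-loop, "regs", "blocks" and "odds" tails.
// split_k is a hypothetical name; the constants mirror the driver code
// in the hunk below. Assumes K >= 16.
#include <cassert>

struct KSplit {
    long loops;   // main-loop iterations, 32 int8 values of K each
    long regs;    // extra 16-element pass count
    long blocks;  // remaining groups of 4 (one sdot per group)
    long odds;    // final 1-3 leftover bytes, loaded lane by lane
};

static KSplit split_k(long K)
{
    const long loops  = ((K + 16) / 32) - 1;
    long rem          = K - loops * 32;
    const long regs   = (rem / 16) - 1;
    rem              -= (regs + 1) * 16;
    const long blocks = rem / 4;
    const long odds   = rem - blocks * 4;
    // The pieces must add back up to the original K.
    assert(loops * 32 + (regs + 1) * 16 + blocks * 4 + odds == K);
    return { loops, regs, blocks, odds };
}
```

For example, K = 37 yields loops = 0, regs = 1, blocks = 1, odds = 1: two 16-element passes, one 4-byte block, and a single leftover byte handled by the new `ld1 {v0.b}[...]` tail.
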
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 48bf842..17f6e57 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(int32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const int32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const int8_t *a_ptr0 = a_ptr0_base;
const int8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ int32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+ int32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d3, [a_ptr3, #0x10]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x18]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
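
Note on the change above and the hunk that follows: the updated hybrid s8s32 dot-product kernels now peel K into four stages (a 32-wide main loop, a 16-wide "regs" tail, 4-wide "blocks", and 1-3 "odds" bytes), and when the output tile is narrower than 16 columns they accumulate into a 4x16 scratch buffer that is copied back to the real C pointer afterwards. The standalone C++ sketch below reproduces only the K-decomposition arithmetic so the counters can be checked in isolation; the function name decompose_k and the sample K values are illustrative only and are not part of the patch.

#include <cstdio>

struct KSplit {
    long loops;  // main-loop iterations, each consuming 32 values of K
    long regs;   // extra 16-wide chunks in the "regs" tail (one chunk is implicit)
    long blocks; // remaining 4-wide blocks, one sdot lane each
    long odds;   // final 1-3 leftover bytes, loaded lane by lane
};

// Mirrors the arithmetic at the top of the kernel's N-tile loop in the hunk below.
static KSplit decompose_k(long K) {
    KSplit s{};
    s.loops = ((K + 16) / 32) - 1;
    K -= s.loops * 32;
    s.regs = (K / 16) - 1;
    K -= (s.regs + 1) * 16;
    s.blocks = K / 4;
    s.odds = K - (s.blocks * 4);
    return s;
}

int main() {
    // By construction, 32*loops + 16*(regs+1) + 4*blocks + odds == K (for K >= 16).
    for (long K : {16, 17, 32, 48, 61, 100}) {
        const KSplit s = decompose_k(K);
        std::printf("K=%3ld -> loops=%ld regs=%ld blocks=%ld odds=%ld\n",
                    K, s.loops, s.regs, s.blocks, s.odds);
    }
    return 0;
}

The result-buffer path visible above follows the same idea in reverse: when width < 16 the assembly stores full 16-column rows into result_buffer with a fixed stride of 16, and the plain C++ loop then copies only the valid width columns into c_ptr_real, so the kernel never stores past the end of a narrow output tile.
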
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 0179139..fdd45a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(int32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const int32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const int8_t *a_ptr0 = a_ptr0_base;
const int8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ int32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+ int32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
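Note on the change below: the hybrid dot-product kernels gain explicit tail handling. K is now split into an unrolled 32-byte main loop, an optional 16-byte pass, 4-byte dot-product blocks, and 1 to 3 odd bytes loaded lane by lane, and partial-width tiles (width < 16) are staged through a local result_buffer before being copied back into C. A minimal C++ sketch of that split, mirroring the counts computed in the hunk below (the standalone helper name split_k is illustrative only, it is not part of arm_gemm):

// Decompose K into the loop counts the updated kernels consume.
//  loops  : iterations of the unrolled main loop (32 bytes of A per row each)
//  regs   : selects which epilogue path is taken (cbz %[regs] in the asm)
//  blocks : remaining 4-byte dot-product steps
//  odds   : 1-3 leftover bytes, loaded one lane at a time
static void split_k(long K, long &loops, long &regs, long &blocks, long &odds) {
    loops  = ((K + 16) / 32) - 1;
    K     -= loops * 32;
    regs   = (K / 16) - 1;
    K     -= (regs + 1) * 16;
    blocks = K / 4;
    odds   = K - (blocks * 4);
}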
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index 230ecdc..487cfa0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const uint32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const uint8_t *a_ptr0 = a_ptr0_base;
const uint8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ uint32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+ uint32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d3, [a_ptr3, #0x10]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x18]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index dbef029..87f46bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
             const uint32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const uint8_t *a_ptr0 = a_ptr0_base;
const uint8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ uint32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+ uint32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
);
break;
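
The ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]" style lines above are hand-encoded AArch64 UDOT (unsigned 8-bit dot product, by-element form) instructions, presumably emitted as raw words so the file still assembles with toolchains that do not know the ARMv8.2 dot-product extension. Below is a minimal scalar model of one such instruction; the function and parameter names (udot_lane, acc, b_bytes, a_bytes, index) are illustrative only and do not appear in the library.

    #include <stdint.h>

    /* Scalar model of "udot vD.4s, vN.16b, vM.4b[index]": every 32-bit
     * accumulator lane gains the dot product of its own 4-byte group of
     * the first source with the single 4-byte group of the second source
     * selected by 'index'. */
    static inline void udot_lane(uint32_t acc[4],
                                 const uint8_t b_bytes[16],
                                 const uint8_t a_bytes[16],
                                 int index)
    {
        for (int lane = 0; lane < 4; ++lane)
        {
            uint32_t sum = 0;
            for (int j = 0; j < 4; ++j)
            {
                sum += (uint32_t)b_bytes[4 * lane + j] * (uint32_t)a_bytes[4 * index + j];
            }
            acc[lane] += sum;
        }
    }

In the kernel, v8-v15 hold packed B data and v0/v4 hold 16 bytes of the A row, so the sixteen UDOTs issued per 16-byte A load (four accumulators v16-v19 times four .4b groups) advance the accumulation by sixteen K values across the sixteen output columns of this row.
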
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
);
break;
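
Compared with the deleted kernels, the replacement bodies take two extra operands, [blocks] and [odds], and append a K tail after the unrolled loop: label 7 consumes one 4-byte group of A per pass with a single set of UDOTs, and labels 8/9 gather a final one to three leftover bytes into the low lanes of the source register with "ld1 {v0.b}[n]" before one last set of UDOTs. A behavioural sketch of that tail for the single-row case is given below; the names (k_tail, acc, a_ptr, b_ptr, a_grp) and the explicit zeroing of the padded bytes are illustrative, not a transcription of the assembly.

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the K tail added by the new kernels: 'blocks' whole 4-byte
     * groups of A, then 'odds' (1-3) leftover bytes padded to a group.
     * Sixteen output columns per row, as in the case 1 kernel above. */
    static void k_tail(uint32_t acc[16], const uint8_t *a_ptr, const uint8_t *b_ptr,
                       unsigned blocks, unsigned odds)
    {
        uint8_t a_grp[4];

        while (blocks--)                        /* label 7 in the assembly        */
        {
            memcpy(a_grp, a_ptr, 4);
            for (int col = 0; col < 16; ++col)
                for (int j = 0; j < 4; ++j)
                    acc[col] += (uint32_t)b_ptr[4 * col + j] * (uint32_t)a_grp[j];
            a_ptr += 4;
            b_ptr += 64;                        /* 16 columns x 4 K bytes of B    */
        }
        if (odds)                               /* labels 8/9 in the assembly     */
        {
            memset(a_grp, 0, sizeof(a_grp));    /* padded lanes contribute zero   */
            memcpy(a_grp, a_ptr, odds);
            for (int col = 0; col < 16; ++col)
                for (int j = 0; j < 4; ++j)
                    acc[col] += (uint32_t)b_ptr[4 * col + j] * (uint32_t)a_grp[j];
        }
    }

In the multi-row variants (case 2 and case 3) each B load feeds one UDOT per row (v16-v19 for row 0, v20-v23 for row 1, v24-v27 for row 2), so the extra rows reuse the same B traffic rather than reloading it.
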
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
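The inline assembly above is the inner loop of a hybrid 8-bit dot-product GEMM tile: each .word-encoded udot vD.4s, vN.16b, vM.4b[i] accumulates a 4-byte dot product into one 32-bit lane, so v16-v19 hold the 16 outputs of row 0, v20-v23 row 1, v24-v27 row 2 and v28-v31 row 3, and the result_buffer copy at the end writes back only the valid rows and columns of a partial tile. The scalar sketch below models that arithmetic; the function name, signature and the assumed B packing of 16 columns by 4 bytes per K-group are illustrative and not taken from this patch.

    #include <cstdint>

    // Scalar model of the 4x16 udot micro-kernel's arithmetic (illustrative only).
    // Assumes K is a multiple of 4 and that B_packed stores, for every group of 4
    // K-elements, 16 columns x 4 consecutive bytes (64 bytes per group).
    static void udot_4x16_reference(const uint8_t *A, int lda, const uint8_t *B_packed,
                                    uint32_t *C, int ldc, int rows /* <= 4 */, int K)
    {
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < 16; c++) {
                uint32_t acc = 0;
                for (int k = 0; k < K; k += 4) {
                    const uint8_t *b = B_packed + (k / 4) * 64 + c * 4;
                    // One udot lane: four unsigned byte products summed into a 32-bit accumulator.
                    for (int i = 0; i < 4; i++) {
                        acc += static_cast<uint32_t>(A[r * lda + k + i]) * b[i];
                    }
                }
                C[r * ldc + c] = acc; // the real kernel also applies beta and handles partial tiles
            }
        }
    }

In the assembly the same arithmetic is done many K-elements at a time; the loops, regs, blocks and odds counters in the surrounding C++ select how many full and partial K blocks each path processes.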
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..c6895a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+class hybrid_fp16_mla_4VLx4
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<__fp16>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+
+ hybrid_fp16_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
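The header above only fixes the strategy's blocking contract: tiles four rows tall, an output width of four SVE vectors of __fp16, and no extra K unrolling, with the actual work done by the free function that kernel points at. A minimal usage sketch, assuming an SVE-enabled toolchain and a B operand already packed into the layout the kernel expects (the wrapper function below is illustrative, not part of the patch):

    #ifdef __ARM_FEATURE_SVE
    #include "sve_hybrid_fp16_mla_4VLx4.hpp"

    // Illustrative wrapper: the kernel blocks M (four rows at a time) and N
    // (four SVE vectors wide) internally, so one call covers the whole problem.
    void gemm_fp16_hybrid(const __fp16 *A, int lda, const __fp16 *B_packed,
                          __fp16 *C, int ldc, int M, int N, int K, __fp16 beta)
    {
        arm_gemm::hybrid_fp16_mla_4VLx4 strat(nullptr); // CPUInfo is not used by this strategy
        strat.kernel(A, lda, B_packed, C, ldc, beta, M, N, K);
    }
    #endif // __ARM_FEATURE_SVE

Since out_width() reports 4 * get_vector_length<__fp16>(), a 256-bit SVE implementation produces tiles of 4 rows by 64 fp16 columns per inner iteration.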
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..ab41fb3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3681 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
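+ // The next few lines split K across the assembly's code paths: loops_count
+ // main-loop iterations each consume 16 fp16 elements of A, one tail block
+ // handles a further 8*(regs_count+1) elements (regs_count is 0 or 1), and
+ // the final 'leftovers' (0-7) elements are loaded under predicate p6 and
+ // processed one at a time via the 'blocks' counter.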
+ const long loops_count = ((K + 8) / 16) - 1;
+ K -= loops_count * 16;
+ const long regs_count = (K / 8) - 1;
+ K -= (regs_count + 1) * 8;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const __fp16 * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(__fp16);
+
+ __fp16 *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(__fp16);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+ const __fp16 *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const __fp16 *a_ptr0 = a_ptr0_base;
+ const __fp16 *b_ptr0 = B + (K_stride * x0);
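+ // b_ptr0 selects the packed B panel for this column block: the panel stores,
+ // for each k in [0, K), 4*VL consecutive __fp16 values, hence the
+ // K_stride * x0 offset.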
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z18.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z19.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z20.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.h, #0\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.h, #0\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z27.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "mov z20.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z21.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.h, #0\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.h, #0\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.h, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1h z28.h, p0/z, [c_ptr3]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z28.h, p7/m, z28.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z29.h, p7/m, z29.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z30.h, p7/m, z30.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z31.h, p7/m, z31.h, z15.h\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1h z28.h, p0, [c_ptr3]\n"
+ "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
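Editorial note (not part of the patch): the inline-assembly block closed above computes a beta-scaled fp16 matrix-multiply-accumulate over a tile of up to four output rows by four vector lengths of columns, with predicated loads and stores covering the ragged right edge. A minimal scalar sketch of the same arithmetic follows, purely for orientation; the function name, the flat row-major operands, and the __fp16 toolchain extension are assumptions, and the real kernel consumes a packed B panel rather than a plain matrix.

// Scalar sketch of the tile update the SVE block performs (assumed AArch64 toolchain with __fp16):
// C[r][c] = beta * C[r][c] + sum_k A[r][k] * B[k][c], for r < rows (<= 4) and c < width.
static void reference_tile_fp16(const __fp16 *A, int lda, const __fp16 *B, int ldb,
                                __fp16 *C, int ldc, __fp16 beta,
                                int rows, int width, int K)
{
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < width; c++) {
            // beta == 0 skips reading C, matching the kernel's "beta0" path.
            __fp16 acc = (beta == (__fp16)0.0f) ? (__fp16)0.0f : (__fp16)(beta * C[r * ldc + c]);
            for (int k = 0; k < K; k++) {
                acc = (__fp16)(acc + A[r * lda + k] * B[k * ldb + c]); // one fmla per element
            }
            C[r * ldc + c] = acc;
        }
    }
}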
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000..ffd7918
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+class hybrid_s8s32_dot_4VLx4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+
+ hybrid_s8s32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
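Editorial note (not part of the patch): the blocking parameters declared above fix the output tile this kernel produces per call, and the generic implementation added next is built around the indexed SDOT instruction. A short hedged sketch, assuming a hypothetical 256-bit SVE implementation (the architecture allows 128 to 2048 bits):

#include <cstdint>

// With 256-bit vectors, get_vector_length<int32_t>() == 256 / 32 == 8, so
//   out_width()  == 4 * 8 == 32 int32 columns per tile,
//   out_height() == 4 rows per tile,
//   k_unroll()   == 4 (each SDOT consumes four int8 values per accumulator lane).
//
// Scalar meaning of one lane of "sdot zAcc.s, zB.b, zA.b[idx]": every 32-bit
// accumulator lane adds the dot product of four signed bytes.
static int32_t sdot_lane(int32_t acc, const int8_t b[4], const int8_t a[4])
{
    for (int j = 0; j < 4; j++) {
        acc += (int32_t)b[j] * (int32_t)a[j];
    }
    return acc;
}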
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..673f186
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = (K + 3) / 4;
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
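+ // Two rows of A: accumulators z16-z19 hold row 0 and z20-z23 hold row 1 of the four-vector-wide C tile; a_ptr1/c_ptr1 (x0/x1) track the second row.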
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
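+ // Three rows of A: row 2 uses a_ptr2/c_ptr2 (x1/x3) and accumulates into z24-z27, alongside z16-z23 for rows 0 and 1.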
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
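+ // Four rows (also the default path): a_ptr1-a_ptr3 map to x0-x2 and c_ptr1-c_ptr3 to x3-x5; the full accumulator set z16-z31 is used, with row 3 in z28-z31.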
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000..2701a9e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
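+// Argument order (matching the definition in generic.cpp): A, lda, B, C, ldc, beta, M, N, K.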
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+class hybrid_u8u32_dot_4VLx4
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+ /* Kernel blocking parameters */
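+ // The inner kernel works on blocks of 4 rows and four SVE vectors of uint32
+ // outputs; k_unroll() of 4 reflects the four 8-bit values each udot
+ // instruction accumulates into a 32-bit lane.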
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+
+ hybrid_u8u32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..d34d0e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
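+ // A rough reading of the decomposition below: the assembly consumes K through a
+ // main loop executed "loops" times, an optional extra pass selected by "regs",
+ // and a predicated tail of "leftovers" bytes handled four at a time ("blocks").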
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = (K + 3) / 4;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
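+ // Dispatch on the rows remaining in this block of 4; each case below is a
+ // hand-unrolled SVE dot-product variant for that row count.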
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9..8228df4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..6cce601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, __fp16, int, int, int);
+
+class native_fp16_mla_4VLx4
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, __fp16, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<__fp16>() * 4;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_fp16_mla_4VLx4;
+
+ native_fp16_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..f1aaeb1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3821 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const long loops_count = ((K + 8) / 16) - 1;
+ K -= loops_count * 16;
+ const long regs_count = (K / 8) - 1;
+ K -= (regs_count + 1) * 8;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const __fp16 * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(__fp16);
+
+ __fp16 *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(__fp16);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+      const __fp16 *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const __fp16 *a_ptr0 = a_ptr0_base;
+ const __fp16 *b_ptr0 = B + x0;
+ long ldbb = ldb * sizeof(__fp16);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z18.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
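+ // Case 2: same kernel structure as case 1, but processes two rows of A per pass
+ // (a_ptr1 = a_ptr0 + lda), accumulating row 0 into z16-z19 and row 1 into z20-z23
+ // before storing to c_ptr0 and c_ptr1.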
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z19.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
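+ // Case 3: three-row variant; a_ptr1/a_ptr2 and c_ptr1/c_ptr2 step by lda/ldc,
+ // with row accumulators z16-z19, z20-z23 and z24-z27.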
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z20.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z25.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z26.h, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z27.h, #0\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
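+ // Height-4 (default) row block: four A rows against four B column vectors, accumulated in z16-z31.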
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "mov z20.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z21.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z25.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z26.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z27.h, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z28.h, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z29.h, #0\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "mov z30.h, #0\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z31.h, #0\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1h z28.h, p0/z, [c_ptr3]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z28.h, p7/m, z28.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z29.h, p7/m, z29.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z30.h, p7/m, z30.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z31.h, p7/m, z31.h, z15.h\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1h z28.h, p0, [c_ptr3]\n"
+ "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
index 9c02d95..abee1bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,63 +86,73 @@
"mov z19.s, #0\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "b 2f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "2:\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"zip1 z10.b, z10.b, z8.b\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z13.b, z13.b, z14.b\n"
- "subs %[loops], %[loops], #0x1\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
"sdot z17.s, z9.b, z0.b[0]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -530,33 +540,33 @@
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -570,33 +580,33 @@
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -610,33 +620,33 @@
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -650,38 +660,38 @@
"sdot z17.s, z9.b, z0.b[0]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -811,33 +821,33 @@
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -851,33 +861,33 @@
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -891,33 +901,33 @@
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -931,14 +941,14 @@
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
"st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #4\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
);
break;
@@ -971,103 +981,108 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z21.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z22.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z22.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z23.s, #0\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 2f\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
"sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"sdot z20.s, z12.b, z1.b[1]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
"sdot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
"sdot z21.s, z13.b, z5.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
"sdot z16.s, z12.b, z4.b[3]\n"
"sdot z20.s, z12.b, z5.b[3]\n"
"sdot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
"sdot z22.s, z10.b, z1.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
"sdot z22.s, z14.b, z1.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
"sdot z22.s, z10.b, z1.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
"sdot z20.s, z12.b, z1.b[3]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
"sdot z22.s, z14.b, z1.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
"sdot z22.s, z10.b, z5.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
"sdot z22.s, z10.b, z5.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
".unreq a_ptr1\n"
".unreq c_ptr1\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
);
break;
@@ -2007,11 +2022,11 @@
"c_ptr2 .req X3\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
"whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z22.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z23.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z24.s, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "mov z25.s, #0\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"mov z27.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
"add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
"sdot z21.s, z13.b, z5.b[3]\n"
"sdot z25.s, z13.b, z6.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z26.s, z14.b, z6.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
"sdot z16.s, z8.b, z4.b[2]\n"
"sdot z20.s, z8.b, z5.b[2]\n"
"sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
"sdot z16.s, z8.b, z0.b[2]\n"
"sdot z20.s, z8.b, z1.b[2]\n"
"sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
".unreq c_ptr1\n"
".unreq c_ptr2\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
@@ -3234,15 +3255,15 @@
"c_ptr3 .req X5\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p1.s, %[temp], %[width]\n"
"incw %[temp], all, mul #1\n"
"whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z23.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z24.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z25.s, #0\n"
"add a_ptr3, a_ptr3, #0x10\n"
- "mov z26.s, #0\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "mov z27.s, #0\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z28.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z9.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z28.s, #0\n"
"mov z29.s, #0\n"
"mov z30.s, #0\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"mov z31.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
"mul z28.s, p7/m, z28.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z29.s, p7/m, z29.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z30.s, p7/m, z30.s, z15.s\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
"add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"sdot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"sdot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"sdot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"sdot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"sdot z30.s, z14.b, z7.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"sdot z30.s, z10.b, z7.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z21.s, z13.b, z5.b[3]\n"
"sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"sdot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"sdot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"sdot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
"sdot z31.s, z15.b, z3.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
"sdot z31.s, z11.b, z3.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
"sdot z31.s, z15.b, z3.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
"sdot z31.s, z11.b, z3.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
"sdot z31.s, z15.b, z3.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
"sdot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
"sdot z31.s, z11.b, z7.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
"sdot z31.s, z15.b, z7.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
"sdot z31.s, z11.b, z7.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
"sdot z31.s, z11.b, z7.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
"sdot z31.s, z15.b, z7.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
"sdot z31.s, z11.b, z7.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
".unreq c_ptr2\n"
".unreq c_ptr3\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
index 7d89948..cdcea59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,63 +86,73 @@
"mov z19.s, #0\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "b 2f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "2:\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"zip1 z10.b, z10.b, z8.b\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z13.b, z13.b, z14.b\n"
- "subs %[loops], %[loops], #0x1\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
"udot z17.s, z13.b, z4.b[3]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
"udot z17.s, z9.b, z0.b[0]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
"udot z17.s, z13.b, z0.b[1]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
"udot z17.s, z9.b, z0.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -530,33 +540,33 @@
"udot z17.s, z13.b, z0.b[3]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -570,33 +580,33 @@
"udot z17.s, z9.b, z0.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -610,33 +620,33 @@
"udot z17.s, z13.b, z0.b[1]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -650,38 +660,38 @@
"udot z17.s, z9.b, z0.b[0]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
"udot z17.s, z13.b, z0.b[3]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
"udot z17.s, z9.b, z4.b[0]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
"udot z17.s, z13.b, z4.b[1]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
"udot z17.s, z9.b, z4.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -811,33 +821,33 @@
"udot z17.s, z13.b, z4.b[3]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -851,33 +861,33 @@
"udot z17.s, z9.b, z4.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -891,33 +901,33 @@
"udot z17.s, z13.b, z4.b[1]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -931,14 +941,14 @@
"udot z17.s, z9.b, z4.b[0]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
"st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #4\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
);
break;
@@ -971,103 +981,108 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z21.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z22.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z22.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z23.s, #0\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 2f\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
"udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"udot z20.s, z12.b, z1.b[1]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
"udot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"udot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
"udot z21.s, z13.b, z5.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"udot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
"udot z16.s, z12.b, z4.b[3]\n"
"udot z20.s, z12.b, z5.b[3]\n"
"udot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
"udot z22.s, z14.b, z5.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
"udot z22.s, z10.b, z1.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
"udot z22.s, z14.b, z1.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
"udot z22.s, z10.b, z1.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
"udot z22.s, z14.b, z1.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
"udot z22.s, z10.b, z1.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
"udot z22.s, z14.b, z1.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
"udot z22.s, z10.b, z1.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
"udot z16.s, z12.b, z0.b[3]\n"
"udot z20.s, z12.b, z1.b[3]\n"
"udot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
"udot z22.s, z14.b, z1.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
"udot z22.s, z10.b, z5.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
"udot z22.s, z14.b, z5.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
"udot z22.s, z10.b, z5.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
"udot z22.s, z14.b, z5.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
"udot z22.s, z10.b, z5.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
"udot z22.s, z14.b, z5.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
"udot z22.s, z10.b, z5.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
".unreq a_ptr1\n"
".unreq c_ptr1\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
);
break;
@@ -2007,11 +2022,11 @@
"c_ptr2 .req X3\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
"whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z22.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z23.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z24.s, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "mov z25.s, #0\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"mov z27.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
"add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
"udot z21.s, z13.b, z5.b[3]\n"
"udot z25.s, z13.b, z6.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
"udot z22.s, z14.b, z5.b[3]\n"
"udot z26.s, z14.b, z6.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
"udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
"udot z16.s, z8.b, z4.b[2]\n"
"udot z20.s, z8.b, z5.b[2]\n"
"udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
"udot z16.s, z8.b, z0.b[2]\n"
"udot z20.s, z8.b, z1.b[2]\n"
"udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
".unreq c_ptr1\n"
".unreq c_ptr2\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
@@ -3234,15 +3255,15 @@
"c_ptr3 .req X5\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p1.s, %[temp], %[width]\n"
"incw %[temp], all, mul #1\n"
"whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z23.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z24.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z25.s, #0\n"
"add a_ptr3, a_ptr3, #0x10\n"
- "mov z26.s, #0\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "mov z27.s, #0\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z28.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z9.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z28.s, #0\n"
"mov z29.s, #0\n"
"mov z30.s, #0\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"mov z31.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
"mul z28.s, p7/m, z28.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z29.s, p7/m, z29.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z30.s, p7/m, z30.s, z15.s\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
"add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"udot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"udot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"udot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
"udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"udot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
"udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"udot z30.s, z14.b, z7.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
"udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"udot z30.s, z10.b, z7.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
"udot z17.s, z13.b, z4.b[3]\n"
"udot z21.s, z13.b, z5.b[3]\n"
"udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z22.s, z14.b, z5.b[3]\n"
"udot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"udot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"udot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
"udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"udot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
"udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
"udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
"udot z31.s, z15.b, z3.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
"udot z31.s, z11.b, z3.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 10f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
"udot z31.s, z15.b, z3.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 13f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
"udot z31.s, z11.b, z3.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 16f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
"udot z31.s, z15.b, z3.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 19f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
"udot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
"udot z31.s, z11.b, z7.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
"udot z31.s, z15.b, z7.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
"udot z31.s, z11.b, z7.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 25f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 28f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
"udot z31.s, z11.b, z7.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 31f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
"udot z31.s, z15.b, z7.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
"b.eq 34f\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
"udot z31.s, z11.b, z7.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
".unreq c_ptr2\n"
".unreq c_ptr3\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
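
Note on the hunks above: they reorder instructions for scheduling, renumber the local asm labels, and move [leftovers] within the inline-asm input-operand list. Because extended-asm operands are referenced by name rather than by position, reordering the input list does not change the meaning of the asm. A minimal sketch of that same structure (not from this patch; scaled_sum and its operands are illustrative only, AArch64 only):

#include <cstdint>

// Outputs first ("=r"/"+r"), then named inputs ("r"), then the clobber list.
int64_t scaled_sum(int64_t a, int64_t b, int64_t scale)
{
    int64_t result;
    __asm__ __volatile__(
        "add %[res], %[x], %[y]\n"
        "mul %[res], %[res], %[s]\n"
        : [res] "=r" (result)                        // output operands
        : [x] "r" (a), [y] "r" (b), [s] "r" (scale)  // named inputs: list order is immaterial
        : "cc"                                       // clobbers
    );
    return result;
}
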
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
new file mode 100644
index 0000000..b50067b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cstring>
+#include <cstdint>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
+namespace padding
+{
+
+template <typename T>
+void copy_and_pad_tile(
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned int n_channels,
+ const T* const inptr,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ T* const outptr,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride,
+ const unsigned int pad_top,
+ const unsigned int pad_left,
+ const unsigned int pad_bottom,
+ const unsigned int pad_right,
+ const T pad_value
+)
+{
+ for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
+ {
+ T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
+
+ if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
+ out_j < pad_left || tile_cols - pad_right <= out_j)
+ {
+ for (unsigned int n = 0; n < n_channels; n++)
+ {
+ output[n] = pad_value;
+ }
+ }
+ else
+ {
+ const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
+ const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
+ std::memcpy(output, input, n_channels * sizeof(T));
+ }
+ }
+ }
+}
+
+template void copy_and_pad_tile(
+ unsigned int, unsigned int, unsigned int,
+ const uint8_t *, unsigned int, unsigned int,
+ uint8_t *, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
+);
+
+template void copy_and_pad_tile(
+ unsigned int, unsigned int, unsigned int,
+ const float *, unsigned int, unsigned int,
+ float *, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int, float
+);
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template void copy_and_pad_tile(
+ unsigned int, unsigned int, unsigned int,
+ const float16_t *, unsigned int, unsigned int,
+ float16_t *, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int, float16_t
+);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <unsigned int TileRows, unsigned int TileCols>
+void CopyCropped<TileRows, TileCols>::execute(
+ const size_t size,
+ const void * const inptr,
+ const size_t in_row_stride,
+ const size_t in_col_stride,
+ void * const outptr,
+ const size_t out_row_stride,
+ const size_t out_col_stride,
+ const unsigned int pad_top,
+ const unsigned int pad_left,
+ const unsigned int pad_bottom,
+ const unsigned int pad_right
+)
+{
+ for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
+ {
+ for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
+ {
+ std::memcpy(
+ static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
+ static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
+ size
+ );
+ }
+ }
+}
+
+template class CopyCropped<2, 2>;
+template class CopyCropped<3, 3>;
+template class CopyCropped<4, 4>;
+
+} // namespace padding
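
The new padding.cpp above adds copy_and_pad_tile, which fills the border of an output tile with pad_value and memcpy's each interior element's channel vector from the source tile. A minimal usage sketch (not part of the patch), assuming the declaration in padding.hpp matches the definition added here; note that tile_rows/tile_cols describe the padded output tile and that all strides are in elements:

#include <vector>
#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"

int main()
{
    const unsigned int channels = 3;
    const unsigned int in_rows = 2, in_cols = 2;    // valid input data
    const unsigned int out_rows = 4, out_cols = 4;  // padded tile: one border element all round

    std::vector<float> in(in_rows * in_cols * channels, 1.0f);
    std::vector<float> out(out_rows * out_cols * channels, -1.0f);

    padding::copy_and_pad_tile<float>(
        out_rows, out_cols, channels,               // dimensions of the *padded* tile
        in.data(), in_cols * channels, channels,    // input row/col strides, in elements
        out.data(), out_cols * channels, channels,  // output row/col strides, in elements
        1, 1, 1, 1,                                 // pad_top, pad_left, pad_bottom, pad_right
        0.0f);                                      // pad_value written into the border
    return 0;
}

CopyCropped performs the inverse crop with raw byte strides, which is why it works on void pointers and a byte count per element rather than a typed channel count.
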
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
new file mode 100644
index 0000000..1de9ebf
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+
+namespace qasymm8
+{
+#if(__ANDROID__ || BARE_METAL)
+template <typename T> T round(T val) { return ::round(val); }
+template <typename T> T exp2(T val) { return ::exp2(val); }
+template <typename T> T log2(T val) { return ::log2(val); }
+#else /* (__ANDROID__ || BARE_METAL) */
+template <typename T> T round(T val) { return std::round(val); }
+template <typename T> T exp2(T val) { return std::exp2(val); }
+template <typename T> T log2(T val) { return std::log2(val); }
+#endif /* (__ANDROID__ || BARE_METAL) */
+
+uint8_t QAsymm8Params::quantize(const float value) const
+{
+ const float transformed = value / scale + offset;
+ return static_cast<uint8_t>(round(std::max(0.0f, std::min(255.0f, transformed))));
+}
+
+float QAsymm8Params::dequantize(const uint8_t value) const
+{
+ return scale * (static_cast<float>(value) - offset);
+}
+
+QAsymm8RescaleParams QAsymm8RescaleParams::make_rescale_params(
+ const QAsymm8Params& weight_quant,
+ const QAsymm8Params& input_quant,
+ const QAsymm8Params& output_quant
+)
+{
+ // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+ const float rescale = weight_quant.scale * input_quant.scale / output_quant.scale;
+ const float shiftf = round(log2(0.5f / rescale));
+ const float multf = exp2(31.0f + shiftf)*rescale;
+
+ int64_t shift = static_cast<int64_t>(shiftf);
+ int64_t mult = static_cast<int64_t>(multf);
+
+ if (mult == (1ll << 31))
+ {
+ mult /= 2;
+ shift--;
+ }
+
+ assert(shift >= 0);
+ assert(mult <= std::numeric_limits<int32_t>::max());
+
+ return QAsymm8RescaleParams(
+ static_cast<int32_t>(shift),
+ static_cast<int32_t>(mult),
+ rescale
+ );
+}
+
+QAsymm8RescaleParams::QAsymm8RescaleParams(int32_t shift, int32_t multi, float rescale)
+ : shift(shift), multiplier(multi), rescale(rescale)
+{
+}
+}
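
The qasymm8.cpp file above follows the gemmlowp recipe referenced in its comment: the float rescale factor is converted into an integer multiplier and right-shift so the kernels can requantise accumulators without floating point. A worked example with hypothetical scales (0.02, 0.5 and 0.1 are illustrative only), recomputing the parameters the same way make_rescale_params does:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical quantisation scales, for illustration only.
    const float weight_scale = 0.02f, input_scale = 0.5f, output_scale = 0.1f;

    const float rescale = weight_scale * input_scale / output_scale;  // 0.02 * 0.5 / 0.1 = 0.1
    const float shiftf  = std::round(std::log2(0.5f / rescale));      // log2(5) ~ 2.32 -> 2
    const float multf   = std::exp2(31.0f + shiftf) * rescale;        // ~0.1 * 2^33 ~ 8.6e8

    const int64_t shift = static_cast<int64_t>(shiftf);
    const int64_t mult  = static_cast<int64_t>(multf);

    // The requantisation then approximates acc * rescale as (acc * mult) >> (31 + shift),
    // with mult kept below 2^31 so it fits the int32 multiplier stored in QAsymm8RescaleParams.
    std::printf("rescale=%f shift=%lld multiplier=%lld\n",
                static_cast<double>(rescale),
                static_cast<long long>(shift), static_cast<long long>(mult));
    return 0;
}

The halving branch in make_rescale_params handles the edge case where rounding pushes the multiplier to exactly 2^31, which would overflow the int32 it is stored in.
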
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index ca1de26..1272754 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,546 +25,1144 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
#ifdef __aarch64__
-
template <>
template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
- const int n_channels,
- const float* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
)
{
- // Copy pointers
- const float *uptr0 = inptr;
- const float *wptr0 = weights;
- float *vptr0 = outptr;
+ __asm __volatile(
+ "add x26, %[inptr0], %[input_row_stride]\n"
+ "add x21, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x23, %[outptr0], %[output_row_stride]\n"
+ "add x27, x26, %[input_row_stride]\n"
+ "add x22, x21, %[input_col_stride1]\n"
+ "and x24, %[n_channels], #3\n"
+ "add x28, x27, %[input_row_stride]\n"
+ "lsr x25, %[n_channels], #2\n"
+ "cbz x25, 4f\n"
+ "1:\n"
+ "ldr q15, [%[wbptr]]\n"
+ "subs x25, x25, #1\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "mov v1.16b, v15.16b\n"
+ "ldr q13, [%[wbptr], #32]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q12, [%[wbptr], #48]\n"
+ "mov v0.16b, v15.16b\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "ldr q9, [%[wbptr], #96]\n"
+ "ldr q8, [%[wbptr], #112]\n"
+ "ldr q7, [%[wbptr], #128]\n"
+ "ldr q6, [%[wbptr], #144]\n"
+ "ldr q24, [%[inptr0]]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "ldr q22, [x26]\n"
+ "fmla v1.4s, v22.4s, v14.4s\n"
+ "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v19.4s, v14.4s\n"
+ "ldr q18, [x27]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "ldr q21, [x26, %[input_col_stride1]]\n"
+ "fmla v1.4s, v18.4s, v11.4s\n"
+ "ldr q17, [%[inptr0], x21]\n"
+ "ldr q20, [x28]\n"
+ "ldr q5, [x27, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v13.4s\n"
+ "fmla v3.4s, v18.4s, v8.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v3.4s, v21.4s, v10.4s\n"
+ "ldr q19, [x26, x21]\n"
+ "fmla v1.4s, v21.4s, v13.4s\n"
+ "ldr q23, [%[inptr0], x22]\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "ldr q22, [x28, %[input_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v14.4s\n"
+ "ldr q21, [x27, x21]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q18, [x26, x22]\n"
+ "fmla v2.4s, v17.4s, v13.4s\n"
+ "ldr q16, [x28, x21]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "ldr q20, [x27, x22]\n"
+ "fmla v3.4s, v5.4s, v7.4s\n"
+ "ldr q4, [x28, x22]\n"
+ "fmla v2.4s, v5.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v5.4s, v10.4s\n"
+ "ldr q15, [%[wbptr]]\n"
+ "fmla v0.4s, v5.4s, v11.4s\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v1.4s, v19.4s, v12.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v2.4s, v19.4s, v10.4s\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "fmla v0.4s, v19.4s, v13.4s\n"
+ "ldr q24, [%[inptr0]]\n"
+ "fmla v1.4s, v22.4s, v7.4s\n"
+ "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "ldr q17, [%[inptr0], x21]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "ldr q13, [%[wbptr], #32]\n"
+ "fmla v3.4s, v21.4s, v6.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v1.4s, v21.4s, v9.4s\n"
+ "ldr q22, [x26]\n"
+ "fmla v2.4s, v21.4s, v7.4s\n"
+ "ldr q8, [%[wbptr], #112]\n"
+ "str q3, [%[outptr0]]\n"
+ "fmla v0.4s, v21.4s, v10.4s\n"
+ "fmla v1.4s, v16.4s, v6.4s\n"
+ "ldr q21, [x26, %[input_col_stride1]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v0.4s, v18.4s, v12.4s\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "str q1, [x23]\n"
+ "mov v3.16b, v15.16b\n"
+ "fmla v2.4s, v20.4s, v6.4s\n"
+ "ldr q18, [x27]\n"
+ "fmla v0.4s, v16.4s, v7.4s\n"
+ "ldr q12, [%[wbptr], #48]\n"
+ "mov v1.16b, v15.16b\n"
+ "ldr q5, [x27, %[input_col_stride1]]\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "fmla v0.4s, v20.4s, v9.4s\n"
+ "ldr q7, [%[wbptr], #128]\n"
+ "mov v2.16b, v15.16b\n"
+ "add x28, x28, #16\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "ldr q20, [x28]\n"
+ "fmla v0.4s, v4.4s, v6.4s\n"
+ "ldr q9, [%[wbptr], #96]\n"
+ "fmla v1.4s, v22.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v3.4s, v19.4s, v13.4s\n"
+ "subs x25, x25, #1\n"
+ "str q0, [x23, %[output_col_stride1]]\n"
+ "fmla v2.4s, v19.4s, v14.4s\n"
+ "ldr q6, [%[wbptr], #144]\n"
+ "add x23, x23, #16\n"
+ "fmla v3.4s, v18.4s, v8.4s\n"
+ "fmla v1.4s, v18.4s, v11.4s\n"
+ "mov v0.16b, v15.16b\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v3.4s, v21.4s, v10.4s\n"
+ "ldr q19, [x26, x21]\n"
+ "fmla v1.4s, v21.4s, v13.4s\n"
+ "ldr q23, [%[inptr0], x22]\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "ldr q22, [x28, %[input_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v14.4s\n"
+ "ldr q21, [x27, x21]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q18, [x26, x22]\n"
+ "fmla v2.4s, v17.4s, v13.4s\n"
+ "ldr q16, [x28, x21]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "ldr q20, [x27, x22]\n"
+ "fmla v3.4s, v5.4s, v7.4s\n"
+ "ldr q4, [x28, x22]\n"
+ "fmla v2.4s, v5.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v5.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v5.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v1.4s, v19.4s, v12.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v2.4s, v19.4s, v10.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v0.4s, v19.4s, v13.4s\n"
+ "fmla v3.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v22.4s, v7.4s\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "str q3, [%[outptr0]]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v9.4s\n"
+ "fmla v2.4s, v21.4s, v7.4s\n"
+ "fmla v0.4s, v21.4s, v10.4s\n"
+ "fmla v1.4s, v16.4s, v6.4s\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "fmla v0.4s, v18.4s, v12.4s\n"
+ "str q1, [x23]\n"
+ "fmla v2.4s, v20.4s, v6.4s\n"
+ "fmla v0.4s, v16.4s, v7.4s\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v20.4s, v9.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v0.4s, v4.4s, v6.4s\n"
+ "str q0, [x23, %[output_col_stride1]]\n"
+ "add x23, x23, #16\n"
+ "4:\n"
+ "cbz x24, 7f\n"
+ "ldr s15, [%[wbptr]]\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "mov v1.16b, v15.16b\n"
+ "ldr s13, [%[wbptr], #8]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s12, [%[wbptr], #12]\n"
+ "mov v0.16b, v15.16b\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "subs x24, x24, #1\n"
+ "ldr s9, [%[wbptr], #24]\n"
+ "ldr s8, [%[wbptr], #28]\n"
+ "ldr s7, [%[wbptr], #32]\n"
+ "ldr s6, [%[wbptr], #36]\n"
+ "ldr s24, [%[inptr0]]\n"
+ "ldr s22, [x26]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v1.4s, v22.4s, v14.4s\n"
+ "ldr s18, [x27]\n"
+ "fmla v2.4s, v19.4s, v14.4s\n"
+ "ldr s21, [x26, %[input_col_stride1]]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "ldr s17, [%[inptr0], x21]\n"
+ "fmla v1.4s, v18.4s, v11.4s\n"
+ "ldr s20, [x28]\n"
+ "ldr s5, [x27, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v13.4s\n"
+ "fmla v3.4s, v18.4s, v8.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v3.4s, v21.4s, v10.4s\n"
+ "ldr s19, [x26, x21]\n"
+ "fmla v1.4s, v21.4s, v13.4s\n"
+ "ldr s23, [%[inptr0], x22]\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "ldr s22, [x28, %[input_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v14.4s\n"
+ "ldr s21, [x27, x21]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s18, [x26, x22]\n"
+ "fmla v2.4s, v17.4s, v13.4s\n"
+ "ldr s16, [x28, x21]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "ldr s20, [x27, x22]\n"
+ "fmla v3.4s, v5.4s, v7.4s\n"
+ "ldr s4, [x28, x22]\n"
+ "fmla v2.4s, v5.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v5.4s, v10.4s\n"
+ "ldr s15, [%[wbptr]]\n"
+ "fmla v0.4s, v5.4s, v11.4s\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v1.4s, v19.4s, v12.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v2.4s, v19.4s, v10.4s\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "fmla v0.4s, v19.4s, v13.4s\n"
+ "ldr s24, [%[inptr0]]\n"
+ "fmla v1.4s, v22.4s, v7.4s\n"
+ "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "ldr s17, [%[inptr0], x21]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "ldr s13, [%[wbptr], #8]\n"
+ "fmla v3.4s, v21.4s, v6.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v1.4s, v21.4s, v9.4s\n"
+ "ldr s22, [x26]\n"
+ "fmla v2.4s, v21.4s, v7.4s\n"
+ "ldr s8, [%[wbptr], #28]\n"
+ "str s3, [%[outptr0]]\n"
+ "fmla v0.4s, v21.4s, v10.4s\n"
+ "fmla v1.4s, v16.4s, v6.4s\n"
+ "ldr s21, [x26, %[input_col_stride1]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v0.4s, v18.4s, v12.4s\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "str s1, [x23]\n"
+ "mov v3.16b, v15.16b\n"
+ "fmla v2.4s, v20.4s, v6.4s\n"
+ "ldr s18, [x27]\n"
+ "fmla v0.4s, v16.4s, v7.4s\n"
+ "ldr s12, [%[wbptr], #12]\n"
+ "mov v1.16b, v15.16b\n"
+ "ldr s5, [x27, %[input_col_stride1]]\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "fmla v0.4s, v20.4s, v9.4s\n"
+ "ldr s7, [%[wbptr], #32]\n"
+ "mov v2.16b, v15.16b\n"
+ "add x28, x28, #4\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "ldr s20, [x28]\n"
+ "fmla v0.4s, v4.4s, v6.4s\n"
+ "ldr s9, [%[wbptr], #24]\n"
+ "fmla v1.4s, v22.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v3.4s, v19.4s, v13.4s\n"
+ "subs x24, x24, #1\n"
+ "str s0, [x23, %[output_col_stride1]]\n"
+ "fmla v2.4s, v19.4s, v14.4s\n"
+ "ldr s6, [%[wbptr], #36]\n"
+ "add x23, x23, #4\n"
+ "fmla v3.4s, v18.4s, v8.4s\n"
+ "fmla v1.4s, v18.4s, v11.4s\n"
+ "mov v0.16b, v15.16b\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v3.4s, v21.4s, v10.4s\n"
+ "ldr s19, [x26, x21]\n"
+ "fmla v1.4s, v21.4s, v13.4s\n"
+ "ldr s23, [%[inptr0], x22]\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "ldr s22, [x28, %[input_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v14.4s\n"
+ "ldr s21, [x27, x21]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s18, [x26, x22]\n"
+ "fmla v2.4s, v17.4s, v13.4s\n"
+ "ldr s16, [x28, x21]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "ldr s20, [x27, x22]\n"
+ "fmla v3.4s, v5.4s, v7.4s\n"
+ "ldr s4, [x28, x22]\n"
+ "fmla v2.4s, v5.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v5.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v5.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v1.4s, v19.4s, v12.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v2.4s, v19.4s, v10.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v0.4s, v19.4s, v13.4s\n"
+ "fmla v3.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v22.4s, v7.4s\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "str s3, [%[outptr0]]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v9.4s\n"
+ "fmla v2.4s, v21.4s, v7.4s\n"
+ "fmla v0.4s, v21.4s, v10.4s\n"
+ "fmla v1.4s, v16.4s, v6.4s\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "fmla v0.4s, v18.4s, v12.4s\n"
+ "str s1, [x23]\n"
+ "fmla v2.4s, v20.4s, v6.4s\n"
+ "fmla v0.4s, v16.4s, v7.4s\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v20.4s, v9.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v0.4s, v4.4s, v6.4s\n"
+ "str s0, [x23, %[output_col_stride1]]\n"
+ "add x23, x23, #4\n"
+ "7:\n"
+ : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
- int channels_remaining = n_channels;
- if (channels_remaining >= 4)
- {
- // Process blocks of 4 channels at a time
- int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
- const bool odd_tail = (channels_remaining / 4) & 1;
- channels_remaining %= 4;
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x21, %[inptr0], %[input_row_stride]\n"
+ "add x24, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x22, %[outptr0], %[output_row_stride]\n"
+ "add x23, x21, %[input_row_stride]\n"
+ "add x27, x24, %[input_col_stride1]\n"
+ "and x25, %[n_channels], #3\n"
+ "add x28, x23, %[input_row_stride]\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q11, [%[wbptr]]\n"
+ "subs x26, x26, #1\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "mov v15.16b, v11.16b\n"
+ "ldr q4, [%[wbptr], #32]\n"
+ "mov v16.16b, v11.16b\n"
+ "ldr q2, [%[wbptr], #48]\n"
+ "mov v14.16b, v11.16b\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "ldr q1, [%[wbptr], #96]\n"
+ "ldr q12, [%[wbptr], #112]\n"
+ "ldr q0, [%[wbptr], #128]\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "ldr q6, [%[inptr0]]\n"
+ "fmla v17.4s, v6.4s, v13.4s\n"
+ "ldr q27, [x21]\n"
+ "fmla v15.4s, v27.4s, v13.4s\n"
+ "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "ldr q24, [x23]\n"
+ "fmla v17.4s, v27.4s, v5.4s\n"
+ "ldr q22, [x21, %[input_col_stride1]]\n"
+ "ldr q9, [%[inptr0], x24]\n"
+ "ldr q8, [x28]\n"
+ "ldr q20, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v17.4s, v24.4s, v12.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "fmla v15.4s, v24.4s, v5.4s\n"
+ "ldr q27, [%[inptr0], x27]\n"
+ "fmla v16.4s, v22.4s, v5.4s\n"
+ "ldr q25, [x28, %[input_col_stride1]]\n"
+ "fmla v17.4s, v22.4s, v10.4s\n"
+ "ldr q24, [x23, x24]\n"
+ "fmla v15.4s, v22.4s, v4.4s\n"
+ "ldr q21, [x21, x27]\n"
+ "fmla v14.4s, v22.4s, v13.4s\n"
+ "ldr q7, [x28, x24]\n"
+ "fmla v17.4s, v9.4s, v2.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "fmla v16.4s, v9.4s, v4.4s\n"
+ "ldr q18, [x28, x27]\n"
+ "fmla v15.4s, v8.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v20.4s, v0.4s\n"
+ "ldr q11, [%[wbptr]]\n"
+ "fmla v16.4s, v20.4s, v12.4s\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "fmla v15.4s, v20.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v14.4s, v20.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v17.4s, v26.4s, v1.4s\n"
+ "ldr q6, [%[inptr0]]\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr q9, [%[inptr0], x24]\n"
+ "fmla v15.4s, v25.4s, v0.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v16.4s, v27.4s, v2.4s\n"
+ "ldr q27, [x21]\n"
+ "fmla v14.4s, v25.4s, v12.4s\n"
+ "ldr q4, [%[wbptr], #32]\n"
+ "fmla v17.4s, v24.4s, v3.4s\n"
+ "ldr q22, [x21, %[input_col_stride1]]\n"
+ "fmla v15.4s, v24.4s, v1.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v16.4s, v24.4s, v0.4s\n"
+ "ldr q12, [%[wbptr], #112]\n"
+ "fmla v14.4s, v24.4s, v10.4s\n"
+ "ldr q24, [x23]\n"
+ "fmla v15.4s, v7.4s, v3.4s\n"
+ "ldr q20, [x23, %[input_col_stride1]]\n"
+ "fmla v16.4s, v21.4s, v1.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v14.4s, v21.4s, v2.4s\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "movi v26.16b, #0\n"
+ "ldr q8, [x28]\n"
+ "fmla v16.4s, v19.4s, v3.4s\n"
+ "subs x26, x26, #1\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "ldr q2, [%[wbptr], #48]\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str q17, [%[outptr0]]\n"
+ "str q16, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v14.4s, v19.4s, v1.4s\n"
+ "str q15, [x22]\n"
+ "mov v17.16b, v11.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "ldr q0, [%[wbptr], #128]\n"
+ "fmla v14.4s, v18.4s, v3.4s\n"
+ "ldr q1, [%[wbptr], #96]\n"
+ "mov v16.16b, v11.16b\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v17.4s, v6.4s, v13.4s\n"
+ "fmla v15.4s, v27.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "str q14, [x22, %[output_col_stride1]]\n"
+ "mov v14.16b, v11.16b\n"
+ "add x22, x22, #16\n"
+ "fmla v17.4s, v27.4s, v5.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v17.4s, v24.4s, v12.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "fmla v15.4s, v24.4s, v5.4s\n"
+ "ldr q27, [%[inptr0], x27]\n"
+ "fmla v16.4s, v22.4s, v5.4s\n"
+ "ldr q25, [x28, %[input_col_stride1]]\n"
+ "fmla v17.4s, v22.4s, v10.4s\n"
+ "ldr q24, [x23, x24]\n"
+ "fmla v15.4s, v22.4s, v4.4s\n"
+ "ldr q21, [x21, x27]\n"
+ "fmla v14.4s, v22.4s, v13.4s\n"
+ "ldr q7, [x28, x24]\n"
+ "fmla v17.4s, v9.4s, v2.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "fmla v16.4s, v9.4s, v4.4s\n"
+ "ldr q18, [x28, x27]\n"
+ "fmla v15.4s, v8.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v20.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v20.4s, v12.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v15.4s, v20.4s, v10.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v14.4s, v20.4s, v5.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v17.4s, v26.4s, v1.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v17.4s, v24.4s, v3.4s\n"
+ "fmla v16.4s, v27.4s, v2.4s\n"
+ "fmla v15.4s, v25.4s, v0.4s\n"
+ "fmla v14.4s, v25.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmla v16.4s, v24.4s, v0.4s\n"
+ "str q17, [%[outptr0]]\n"
+ "fmla v15.4s, v24.4s, v1.4s\n"
+ "fmla v14.4s, v24.4s, v10.4s\n"
+ "fmla v16.4s, v21.4s, v1.4s\n"
+ "fmla v15.4s, v7.4s, v3.4s\n"
+ "fmla v14.4s, v21.4s, v2.4s\n"
+ "fmla v16.4s, v19.4s, v3.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "str q15, [x22]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmla v14.4s, v19.4s, v1.4s\n"
+ "str q16, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v14.4s, v18.4s, v3.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "str q14, [x22, %[output_col_stride1]]\n"
+ "add x22, x22, #16\n"
+ "4:\n"
+ "cbz x25, 7f\n"
+ "ldr s11, [%[wbptr]]\n"
+ "mov v17.16b, v11.16b\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "mov v15.16b, v11.16b\n"
+ "ldr s4, [%[wbptr], #8]\n"
+ "mov v16.16b, v11.16b\n"
+ "ldr s2, [%[wbptr], #12]\n"
+ "mov v14.16b, v11.16b\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "subs x25, x25, #1\n"
+ "ldr s1, [%[wbptr], #24]\n"
+ "ldr s12, [%[wbptr], #28]\n"
+ "ldr s0, [%[wbptr], #32]\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "ldr s6, [%[inptr0]]\n"
+ "ldr s27, [x21]\n"
+ "fmla v17.4s, v6.4s, v13.4s\n"
+ "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v15.4s, v27.4s, v13.4s\n"
+ "ldr s24, [x23]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "ldr s22, [x21, %[input_col_stride1]]\n"
+ "fmla v17.4s, v27.4s, v5.4s\n"
+ "ldr s9, [%[inptr0], x24]\n"
+ "ldr s8, [x28]\n"
+ "ldr s20, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v17.4s, v24.4s, v12.4s\n"
+ "ldr s26, [x21, x24]\n"
+ "fmla v15.4s, v24.4s, v5.4s\n"
+ "ldr s27, [%[inptr0], x27]\n"
+ "fmla v16.4s, v22.4s, v5.4s\n"
+ "ldr s25, [x28, %[input_col_stride1]]\n"
+ "fmla v17.4s, v22.4s, v10.4s\n"
+ "ldr s24, [x23, x24]\n"
+ "fmla v15.4s, v22.4s, v4.4s\n"
+ "ldr s21, [x21, x27]\n"
+ "fmla v14.4s, v22.4s, v13.4s\n"
+ "ldr s7, [x28, x24]\n"
+ "fmla v17.4s, v9.4s, v2.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "fmla v16.4s, v9.4s, v4.4s\n"
+ "ldr s18, [x28, x27]\n"
+ "fmla v15.4s, v8.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v20.4s, v0.4s\n"
+ "ldr s11, [%[wbptr]]\n"
+ "fmla v16.4s, v20.4s, v12.4s\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "fmla v15.4s, v20.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v14.4s, v20.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v17.4s, v26.4s, v1.4s\n"
+ "ldr s6, [%[inptr0]]\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr s9, [%[inptr0], x24]\n"
+ "fmla v15.4s, v25.4s, v0.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v16.4s, v27.4s, v2.4s\n"
+ "ldr s27, [x21]\n"
+ "fmla v14.4s, v25.4s, v12.4s\n"
+ "ldr s4, [%[wbptr], #8]\n"
+ "fmla v17.4s, v24.4s, v3.4s\n"
+ "ldr s22, [x21, %[input_col_stride1]]\n"
+ "fmla v15.4s, v24.4s, v1.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v16.4s, v24.4s, v0.4s\n"
+ "ldr s12, [%[wbptr], #28]\n"
+ "fmla v14.4s, v24.4s, v10.4s\n"
+ "ldr s24, [x23]\n"
+ "fmla v15.4s, v7.4s, v3.4s\n"
+ "ldr s20, [x23, %[input_col_stride1]]\n"
+ "fmla v16.4s, v21.4s, v1.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v14.4s, v21.4s, v2.4s\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "movi v26.16b, #0\n"
+ "ldr s8, [x28]\n"
+ "fmla v16.4s, v19.4s, v3.4s\n"
+ "subs x25, x25, #1\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "ldr s2, [%[wbptr], #12]\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str s17, [%[outptr0]]\n"
+ "str s16, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v14.4s, v19.4s, v1.4s\n"
+ "str s15, [x22]\n"
+ "mov v17.16b, v11.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "ldr s0, [%[wbptr], #32]\n"
+ "fmla v14.4s, v18.4s, v3.4s\n"
+ "ldr s1, [%[wbptr], #24]\n"
+ "mov v16.16b, v11.16b\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v17.4s, v6.4s, v13.4s\n"
+ "fmla v15.4s, v27.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "str s14, [x22, %[output_col_stride1]]\n"
+ "mov v14.16b, v11.16b\n"
+ "add x22, x22, #4\n"
+ "fmla v17.4s, v27.4s, v5.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v17.4s, v24.4s, v12.4s\n"
+ "ldr s26, [x21, x24]\n"
+ "fmla v15.4s, v24.4s, v5.4s\n"
+ "ldr s27, [%[inptr0], x27]\n"
+ "fmla v16.4s, v22.4s, v5.4s\n"
+ "ldr s25, [x28, %[input_col_stride1]]\n"
+ "fmla v17.4s, v22.4s, v10.4s\n"
+ "ldr s24, [x23, x24]\n"
+ "fmla v15.4s, v22.4s, v4.4s\n"
+ "ldr s21, [x21, x27]\n"
+ "fmla v14.4s, v22.4s, v13.4s\n"
+ "ldr s7, [x28, x24]\n"
+ "fmla v17.4s, v9.4s, v2.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "fmla v16.4s, v9.4s, v4.4s\n"
+ "ldr s18, [x28, x27]\n"
+ "fmla v15.4s, v8.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v20.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v20.4s, v12.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v15.4s, v20.4s, v10.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v14.4s, v20.4s, v5.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v17.4s, v26.4s, v1.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v17.4s, v24.4s, v3.4s\n"
+ "fmla v16.4s, v27.4s, v2.4s\n"
+ "fmla v15.4s, v25.4s, v0.4s\n"
+ "fmla v14.4s, v25.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmla v16.4s, v24.4s, v0.4s\n"
+ "str s17, [%[outptr0]]\n"
+ "fmla v15.4s, v24.4s, v1.4s\n"
+ "fmla v14.4s, v24.4s, v10.4s\n"
+ "fmla v16.4s, v21.4s, v1.4s\n"
+ "fmla v15.4s, v7.4s, v3.4s\n"
+ "fmla v14.4s, v21.4s, v2.4s\n"
+ "fmla v16.4s, v19.4s, v3.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "str s15, [x22]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmla v14.4s, v19.4s, v1.4s\n"
+ "str s16, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v14.4s, v18.4s, v3.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "str s14, [x22, %[output_col_stride1]]\n"
+ "add x22, x22, #4\n"
+ "7:\n"
+ : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
- asm volatile (
- "qW11B .req q0\n" "vW11B .req v0\n" "qW33A .req q1\n" "qU32B .req q1\n"
- "vW33A .req v1\n" "vU32B .req v1\n" "qU44B .req q2\n" "qW21A .req q2\n"
- "vU44B .req v2\n" "vW21A .req v2\n" "qU21B .req q3\n" "qU32A .req q3\n"
- "vU21B .req v3\n" "vU32A .req v3\n" "qU43A .req q4\n" "qV21B .req q4\n"
- "vU43A .req v4\n" "vV21B .req v4\n" "qU24A .req q5\n" "qU44A .req q5\n"
- "qU33B .req q5\n" "vU24A .req v5\n" "vU44A .req v5\n" "vU33B .req v5\n"
- "qU31A .req q6\n" "qV12B .req q6\n" "qU23A .req q6\n" "vU31A .req v6\n"
- "vV12B .req v6\n" "vU23A .req v6\n" "qW31B .req q7\n" "qV22A .req q7\n"
- "vW31B .req v7\n" "vV22A .req v7\n" "qV12A .req q8\n" "qW21B .req q8\n"
- "vV12A .req v8\n" "vW21B .req v8\n" "qU22B .req q9\n" "qU34A .req q9\n"
- "vU22B .req v9\n" "vU34A .req v9\n" "qU13B .req q10\n" "qU13A .req q10\n"
- "vU13B .req v10\n" "vU13A .req v10\n" "qU34B .req q11\n" "qU22A .req q11\n"
- "vU34B .req v11\n" "vU22A .req v11\n" "qU24B .req q12\n" "qU31B .req q12\n"
- "vU24B .req v12\n" "vU31B .req v12\n" "qW12B .req q13\n" "qW13A .req q13\n"
- "vW12B .req v13\n" "vW13A .req v13\n" "qV21A .req q14\n" "qV11B .req q14\n"
- "vV21A .req v14\n" "vV11B .req v14\n" "qW32A .req q15\n" "qW32B .req q15\n"
- "vW32A .req v15\n" "vW32B .req v15\n" "qW31A .req q16\n" "qV22B .req q16\n"
- "vW31A .req v16\n" "vV22B .req v16\n"
- "qW11A .req q17\n" "vW11A .req v17\n" "qW13B .req q18\n" "qU14A .req q18\n"
- "vW13B .req v18\n" "vU14A .req v18\n" "qU33A .req q19\n" "qW33B .req q19\n"
- "vU33A .req v19\n" "vW33B .req v19\n" "qW22A .req q20\n" "qU23B .req q20\n"
- "vW22A .req v20\n" "vU23B .req v20\n" "qU12A .req q21\n" "qU42A .req q21\n"
- "vU12A .req v21\n" "vU42A .req v21\n" "qU41A .req q22\n" "qU42B .req q22\n"
- "vU41A .req v22\n" "vU42B .req v22\n" "qW23A .req q23\n" "qW23B .req q23\n"
- "vW23A .req v23\n" "vW23B .req v23\n" "qU43B .req q24\n" "qU11A .req q24\n"
- "vU43B .req v24\n" "vU11A .req v24\n" "qU12B .req q25\n" "qW12A .req q25\n"
- "vU12B .req v25\n" "vW12A .req v25\n" "qU41B .req q26\n" "qV11A .req q26\n"
- "vU41B .req v26\n" "vV11A .req v26\n" "qW22B .req q27\n" "vW22B .req v27\n"
- "qU11B .req q28\n" "qU14B .req q28\n" "vU11B .req v28\n" "vU14B .req v28\n"
- "qU21A .req q29\n" "vU21A .req v29\n"
-
- "u_col_stride1 .req %x[u_col_stride]\n"
- "u_col_stride2 .req x0\n"
- "u_col_stride3 .req x1\n"
- "uptr1 .req x2\n"
- "uptr2 .req x3\n"
- "uptr3 .req x4\n"
- "wptr1 .req x5\n"
- "wptr2 .req x6\n"
- "vptr1 .req x7\n"
- "w_col_stride1 .req %x[w_col_stride]\n"
- "w_col_stride2 .req x8\n"
-
- // Prepare strides and pointers
- "add uptr1, %x[uptr0], %x[u_row_stride]\n"
- "add uptr2, uptr1 , %x[u_row_stride]\n"
- "add uptr3, uptr2 , %x[u_row_stride]\n"
- "add wptr1, %x[wptr0], %x[w_row_stride]\n"
- "add wptr2, wptr1 , %x[w_row_stride]\n"
- "add vptr1, %x[vptr0], %x[v_row_stride]\n"
- "add u_col_stride2, %x[u_col_stride], %x[u_col_stride]\n"
- "add u_col_stride3, u_col_stride2 , %x[u_col_stride]\n"
- "add w_col_stride2, %x[w_col_stride], %x[w_col_stride]\n"
-
- // Load in preparation for execution
- "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
- "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
- "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
- "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
- "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
- "ldr qW11A, [%x[wptr0]], #0x10\n"
- "ldr qU24A, [uptr1, u_col_stride3]\n"
- "ldr qW23A, [wptr1, w_col_stride2]\n"
- "ldr qU23A, [uptr1, u_col_stride2]\n"
- "ldr qW22A, [wptr1, w_col_stride1]\n"
- "ldr qU22A, [uptr1, u_col_stride1]\n"
- "ldr qW21A, [wptr1], #0x10\n"
- "ldr qU34A, [uptr2, u_col_stride3]\n"
- "ldr qW33A, [wptr2, w_col_stride2]\n"
- "ldr qU33A, [uptr2, u_col_stride2]\n"
- "ldr qW32A, [wptr2, w_col_stride1]\n"
- "ldr qU32A, [uptr2, u_col_stride1]\n"
- "ldr qW31A, [wptr2], #0x10\n"
- "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
- "cbz %x[iters], 2f\n" // Jump to tail if doing zero iterations of loop
-
- "1:" // Main loop body
- // A part
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "ldr qW11B, [%x[wptr0]], #0x10\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "ldr qU24B, [uptr1, u_col_stride3]\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "ldr qW23B, [wptr1, w_col_stride2]\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "ldr qU23B, [uptr1, u_col_stride2]\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "ldr qW22B, [wptr1, w_col_stride1]\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "ldr qU22B, [uptr1, u_col_stride1]\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "ldr qW21B, [wptr1], #0x10\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "ldr qU34B, [uptr2, u_col_stride3]\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "ldr qW33B, [wptr2, w_col_stride2]\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, %x[v_col_stride]]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "ldr qU33B, [uptr2, u_col_stride2]\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "ldr qW32B, [wptr2, w_col_stride1]\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "ldr qU32B, [uptr2, u_col_stride1]\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "ldr qW31B, [wptr2], #0x10\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
-
- // B part
- "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
- "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
- "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
- "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
- "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
- "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
- "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
- "subs %x[iters], %x[iters], #1\n"
- "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
- "ldr qU44B, [uptr3, u_col_stride3]\n"
- "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
- "ldr qU43B, [uptr3, u_col_stride2]\n"
- "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
- "ldr qU42B, [uptr3, u_col_stride1]\n"
- "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
- "ldr qU11B, [%x[uptr0]], #0x10\n"
- "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
- "ldr qU21B, [uptr1], #0x10\n"
- "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
- "ldr qU31B, [uptr2], #0x10\n"
- "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
- "ldr qU41B, [uptr3], #0x10\n"
- "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
- "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
- "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
- "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
- "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
- "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
- "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
- "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
- "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
- "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
- "ldr qW11A, [%x[wptr0]], #0x10\n"
- "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
- "ldr qU24A, [uptr1, u_col_stride3]\n"
- "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
- "ldr qW23A, [wptr1, w_col_stride2]\n"
- "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
- "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
- "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
- "ldr qU23A, [uptr1, u_col_stride2]\n"
- "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
- "ldr qW22A, [wptr1, w_col_stride1]\n"
- "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
- "ldr qU22A, [uptr1, u_col_stride1]\n"
- "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
- "ldr qW21A, [wptr1], #0x10\n"
- "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
- "ldr qU34A, [uptr2, u_col_stride3]\n"
- "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
- "ldr qW33A, [wptr2, w_col_stride2]\n"
- "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
- "str qV22B, [vptr1, %x[v_col_stride]]\n"
- "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
- "ldr qU33A, [uptr2, u_col_stride2]\n"
- "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
- "ldr qW32A, [wptr2, w_col_stride1]\n"
- "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
- "ldr qU32A, [uptr2, u_col_stride1]\n"
- "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
- "str qV11B, [%x[vptr0]], #0x10\n"
- "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
- "ldr qW31A, [wptr2], #0x10\n"
- "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
- "str qV21B, [vptr1], #0x10\n"
- "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
- "bne 1b\n" // Loop
-
- "2:" // Branch destination for zero loops
- "cbnz %w[odd_tail], 4f\n"
-
- "3:" // Even number of iterations
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "ldr qW11B, [%x[wptr0]], #0x10\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "ldr qU24B, [uptr1, u_col_stride3]\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "ldr qW23B, [wptr1, w_col_stride2]\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "ldr qU23B, [uptr1, u_col_stride2]\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "ldr qW22B, [wptr1, w_col_stride1]\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "ldr qU22B, [uptr1, u_col_stride1]\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "ldr qW21B, [wptr1], #0x10\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "ldr qU34B, [uptr2, u_col_stride3]\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "ldr qW33B, [wptr2, w_col_stride2]\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, %x[v_col_stride]]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "ldr qU33B, [uptr2, u_col_stride2]\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "ldr qW32B, [wptr2, w_col_stride1]\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "ldr qU32B, [uptr2, u_col_stride1]\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "ldr qW31B, [wptr2], #0x10\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
-
- "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
- "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
- "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
- "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
- "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
- "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
- "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
- "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
- "ldr qU44B, [uptr3, u_col_stride3]\n"
- "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
- "ldr qU43B, [uptr3, u_col_stride2]\n"
- "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
- "ldr qU42B, [uptr3, u_col_stride1]\n"
- "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
- "ldr qU11B, [%x[uptr0]], #0x10\n"
- "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
- "ldr qU21B, [uptr1], #0x10\n"
- "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
- "ldr qU31B, [uptr2], #0x10\n"
- "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
- "ldr qU41B, [uptr3], #0x10\n"
- "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
- "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
- "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
- "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
- "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
- "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
- "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
- "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
- "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
- "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
- "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
- "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
- "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
- "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
- "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
- "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
- "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
- "str qV22B, [vptr1, %x[v_col_stride]]\n"
- "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
- "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
- "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
- "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
- "str qV11B, [%x[vptr0]], #0x10\n"
- "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
- "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
- "str qV21B, [vptr1], #0x10\n"
- "b 5f\n"
-
- "4:" // Odd number of iterations
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, %x[v_col_stride]]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
-
- "5:" // End of method
-
- ".unreq qW11B\n" ".unreq qW33A\n" ".unreq qU32B\n"
- ".unreq qU44B\n" ".unreq qW21A\n" ".unreq qU21B\n" ".unreq qU32A\n"
- ".unreq qU43A\n" ".unreq qV21B\n"
- ".unreq qU24A\n" ".unreq qU44A\n" ".unreq qU33B\n"
- ".unreq qU31A\n" ".unreq qV12B\n" ".unreq qU23A\n"
- ".unreq qW31B\n" ".unreq qV22A\n" ".unreq qV12A\n" ".unreq qW21B\n"
- ".unreq qU22B\n" ".unreq qU34A\n" ".unreq qU13B\n" ".unreq qU13A\n"
- ".unreq qU34B\n" ".unreq qU22A\n" ".unreq qU24B\n" ".unreq qU31B\n"
- ".unreq qW12B\n" ".unreq qW13A\n" ".unreq qV21A\n" ".unreq qV11B\n"
- ".unreq qW32A\n" ".unreq qW32B\n" ".unreq qW31A\n" ".unreq qV22B\n"
- ".unreq qW11A\n" ".unreq qW13B\n" ".unreq qU14A\n"
- ".unreq qU33A\n" ".unreq qW33B\n" ".unreq qW22A\n" ".unreq qU23B\n"
- ".unreq qU12A\n" ".unreq qU42A\n" ".unreq qU41A\n" ".unreq qU42B\n"
- ".unreq qW23A\n" ".unreq qW23B\n" ".unreq qU43B\n" ".unreq qU11A\n"
- ".unreq qU12B\n" ".unreq qW12A\n" ".unreq qU41B\n" ".unreq qV11A\n"
- ".unreq qW22B\n" ".unreq qU11B\n" ".unreq qU14B\n" ".unreq qU21A\n"
- ".unreq vW11B\n" ".unreq vW33A\n" ".unreq vU32B\n"
- ".unreq vU44B\n" ".unreq vW21A\n" ".unreq vU21B\n" ".unreq vU32A\n"
- ".unreq vU43A\n" ".unreq vV21B\n"
- ".unreq vU24A\n" ".unreq vU44A\n" ".unreq vU33B\n"
- ".unreq vU31A\n" ".unreq vV12B\n" ".unreq vU23A\n"
- ".unreq vW31B\n" ".unreq vV22A\n" ".unreq vV12A\n" ".unreq vW21B\n"
- ".unreq vU22B\n" ".unreq vU34A\n" ".unreq vU13B\n" ".unreq vU13A\n"
- ".unreq vU34B\n" ".unreq vU22A\n" ".unreq vU24B\n" ".unreq vU31B\n"
- ".unreq vW12B\n" ".unreq vW13A\n" ".unreq vV21A\n" ".unreq vV11B\n"
- ".unreq vW32A\n" ".unreq vW32B\n" ".unreq vW31A\n" ".unreq vV22B\n"
- ".unreq vW11A\n" ".unreq vW13B\n" ".unreq vU14A\n"
- ".unreq vU33A\n" ".unreq vW33B\n" ".unreq vW22A\n" ".unreq vU23B\n"
- ".unreq vU12A\n" ".unreq vU42A\n" ".unreq vU41A\n" ".unreq vU42B\n"
- ".unreq vW23A\n" ".unreq vW23B\n" ".unreq vU43B\n" ".unreq vU11A\n"
- ".unreq vU12B\n" ".unreq vW12A\n" ".unreq vU41B\n" ".unreq vV11A\n"
- ".unreq vW22B\n" ".unreq vU11B\n" ".unreq vU14B\n" ".unreq vU21A\n"
- ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
- ".unreq u_col_stride3\n"
- ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n"
- ".unreq wptr1\n" ".unreq wptr2\n" ".unreq vptr1\n"
- ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
-
- : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
- [iters] "+r" (n_iters)
- : [u_row_stride] "r" (in_row_stride * sizeof(float)),
- [u_col_stride] "r" (in_col_stride * sizeof(float)),
- [v_row_stride] "r" (out_row_stride * sizeof(float)),
- [v_col_stride] "r" (out_col_stride * sizeof(float)),
- [w_row_stride] "r" (weight_row_stride * sizeof(float)),
- [w_col_stride] "r" (weight_col_stride * sizeof(float)),
- [odd_tail] "r" (odd_tail)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "cc",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "memory"
- );
- }
-
- if (channels_remaining)
- {
- // Fall back on the unoptimised version to clean up the tail
- ConvImpl::process_tile<false>(
- channels_remaining,
- wptr0, weight_row_stride, weight_col_stride,
- uptr0, in_row_stride, in_col_stride,
- vptr0, out_row_stride, out_col_stride,
- 0, 0, 0, 0, 0, 0
- );
- }
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x21, %[inptr0], %[input_row_stride]\n"
+ "add x23, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x24, %[outptr0], %[output_row_stride]\n"
+ "add x27, x21, %[input_row_stride]\n"
+ "add x22, x23, %[input_col_stride1]\n"
+ "and x25, %[n_channels], #3\n"
+ "add x28, x27, %[input_row_stride]\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q19, [%[wbptr]]\n"
+ "subs x26, x26, #1\n"
+ "mov v3.16b, v19.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v2.16b, v19.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "mov v0.16b, v19.16b\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "ldr q23, [%[wbptr], #80]\n"
+ "ldr q15, [%[wbptr], #96]\n"
+ "ldr q20, [%[wbptr], #112]\n"
+ "ldr q21, [%[wbptr], #128]\n"
+ "ldr q14, [%[wbptr], #144]\n"
+ "ldr q16, [%[inptr0]]\n"
+ "fmla v3.4s, v16.4s, v12.4s\n"
+ "ldr q28, [x21]\n"
+ "fmla v1.4s, v28.4s, v12.4s\n"
+ "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr q24, [x27]\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "ldr q8, [x21, %[input_col_stride1]]\n"
+ "ldr q9, [%[inptr0], x23]\n"
+ "ldr q18, [x28]\n"
+ "ldr q6, [x27, %[input_col_stride1]]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v3.4s, v24.4s, v20.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "ldr q28, [%[inptr0], x22]\n"
+ "fmla v2.4s, v8.4s, v13.4s\n"
+ "ldr q24, [x28, %[input_col_stride1]]\n"
+ "fmla v3.4s, v8.4s, v23.4s\n"
+ "ldr q27, [x27, x23]\n"
+ "fmla v1.4s, v8.4s, v11.4s\n"
+ "ldr q7, [x21, x22]\n"
+ "fmla v0.4s, v8.4s, v12.4s\n"
+ "ldr q17, [x28, x23]\n"
+ "fmla v3.4s, v9.4s, v10.4s\n"
+ "ldr q5, [x27, x22]\n"
+ "fmla v2.4s, v9.4s, v11.4s\n"
+ "ldr q4, [x28, x22]\n"
+ "fmla v1.4s, v18.4s, v20.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v3.4s, v6.4s, v21.4s\n"
+ "ldr q19, [%[wbptr]]\n"
+ "fmla v2.4s, v6.4s, v20.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "fmla v1.4s, v6.4s, v23.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v6.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v3.4s, v25.4s, v15.4s\n"
+ "ldr q16, [%[inptr0]]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v25.4s, v23.4s\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "ldr q9, [%[inptr0], x23]\n"
+ "fmla v1.4s, v24.4s, v21.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "ldr q28, [x21]\n"
+ "fmla v0.4s, v24.4s, v20.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "ldr q8, [x21, %[input_col_stride1]]\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v2.4s, v27.4s, v21.4s\n"
+ "ldr q20, [%[wbptr], #112]\n"
+ "fmla v0.4s, v27.4s, v23.4s\n"
+ "ldr q24, [x27]\n"
+ "fmla v1.4s, v17.4s, v14.4s\n"
+ "ldr q6, [x27, %[input_col_stride1]]\n"
+ "fmla v2.4s, v7.4s, v15.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v0.4s, v7.4s, v10.4s\n"
+ "ldr q23, [%[wbptr], #80]\n"
+ "movi v25.16b, #0\n"
+ "ldr q18, [x28]\n"
+ "fmla v2.4s, v5.4s, v14.4s\n"
+ "subs x26, x26, #1\n"
+ "fmla v0.4s, v17.4s, v21.4s\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "fmov v26.4s, #6.0\n"
+ "fmax v3.4s, v3.4s, v25.4s\n"
+ "fmax v2.4s, v2.4s, v25.4s\n"
+ "fmax v1.4s, v1.4s, v25.4s\n"
+ "fmla v0.4s, v5.4s, v15.4s\n"
+ "ldr q21, [%[wbptr], #128]\n"
+ "fmin v3.4s, v3.4s, v26.4s\n"
+ "fmin v2.4s, v2.4s, v26.4s\n"
+ "fmin v1.4s, v1.4s, v26.4s\n"
+ "str q3, [%[outptr0]]\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v4.4s, v14.4s\n"
+ "str q1, [x24]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr q15, [%[wbptr], #96]\n"
+ "fmax v0.4s, v0.4s, v25.4s\n"
+ "ldr q14, [%[wbptr], #144]\n"
+ "mov v2.16b, v19.16b\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmin v0.4s, v0.4s, v26.4s\n"
+ "fmla v3.4s, v16.4s, v12.4s\n"
+ "fmla v1.4s, v28.4s, v12.4s\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "str q0, [x24, %[output_col_stride1]]\n"
+ "mov v0.16b, v19.16b\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v3.4s, v24.4s, v20.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "ldr q28, [%[inptr0], x22]\n"
+ "fmla v2.4s, v8.4s, v13.4s\n"
+ "ldr q24, [x28, %[input_col_stride1]]\n"
+ "fmla v3.4s, v8.4s, v23.4s\n"
+ "ldr q27, [x27, x23]\n"
+ "fmla v1.4s, v8.4s, v11.4s\n"
+ "ldr q7, [x21, x22]\n"
+ "fmla v0.4s, v8.4s, v12.4s\n"
+ "ldr q17, [x28, x23]\n"
+ "fmla v3.4s, v9.4s, v10.4s\n"
+ "ldr q5, [x27, x22]\n"
+ "fmla v2.4s, v9.4s, v11.4s\n"
+ "ldr q4, [x28, x22]\n"
+ "fmla v1.4s, v18.4s, v20.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v3.4s, v6.4s, v21.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v2.4s, v6.4s, v20.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v1.4s, v6.4s, v23.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v0.4s, v6.4s, v13.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v3.4s, v25.4s, v15.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v23.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "movi v25.16b, #0\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "fmov v26.4s, #6.0\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v21.4s\n"
+ "fmla v0.4s, v24.4s, v20.4s\n"
+ "fmax v3.4s, v3.4s, v25.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v2.4s, v27.4s, v21.4s\n"
+ "fmla v0.4s, v27.4s, v23.4s\n"
+ "fmin v3.4s, v3.4s, v26.4s\n"
+ "str q3, [%[outptr0]]\n"
+ "fmla v2.4s, v7.4s, v15.4s\n"
+ "fmla v0.4s, v7.4s, v10.4s\n"
+ "fmla v1.4s, v17.4s, v14.4s\n"
+ "fmla v2.4s, v5.4s, v14.4s\n"
+ "fmla v0.4s, v17.4s, v21.4s\n"
+ "fmax v1.4s, v1.4s, v25.4s\n"
+ "fmax v2.4s, v2.4s, v25.4s\n"
+ "fmla v0.4s, v5.4s, v15.4s\n"
+ "fmin v1.4s, v1.4s, v26.4s\n"
+ "fmin v2.4s, v2.4s, v26.4s\n"
+ "str q1, [x24]\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v4.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmax v0.4s, v0.4s, v25.4s\n"
+ "fmin v0.4s, v0.4s, v26.4s\n"
+ "str q0, [x24, %[output_col_stride1]]\n"
+ "add x24, x24, #16\n"
+ "4:\n"
+ "cbz x25, 7f\n"
+ "ldr s19, [%[wbptr]]\n"
+ "mov v3.16b, v19.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v2.16b, v19.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "mov v0.16b, v19.16b\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "ldr s23, [%[wbptr], #20]\n"
+ "subs x25, x25, #1\n"
+ "ldr s15, [%[wbptr], #24]\n"
+ "ldr s20, [%[wbptr], #28]\n"
+ "ldr s21, [%[wbptr], #32]\n"
+ "ldr s14, [%[wbptr], #36]\n"
+ "ldr s16, [%[inptr0]]\n"
+ "ldr s28, [x21]\n"
+ "fmla v3.4s, v16.4s, v12.4s\n"
+ "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v1.4s, v28.4s, v12.4s\n"
+ "ldr s24, [x27]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr s8, [x21, %[input_col_stride1]]\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "ldr s9, [%[inptr0], x23]\n"
+ "ldr s18, [x28]\n"
+ "ldr s6, [x27, %[input_col_stride1]]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v3.4s, v24.4s, v20.4s\n"
+ "ldr s25, [x21, x23]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "ldr s28, [%[inptr0], x22]\n"
+ "fmla v2.4s, v8.4s, v13.4s\n"
+ "ldr s24, [x28, %[input_col_stride1]]\n"
+ "fmla v3.4s, v8.4s, v23.4s\n"
+ "ldr s27, [x27, x23]\n"
+ "fmla v1.4s, v8.4s, v11.4s\n"
+ "ldr s7, [x21, x22]\n"
+ "fmla v0.4s, v8.4s, v12.4s\n"
+ "ldr s17, [x28, x23]\n"
+ "fmla v3.4s, v9.4s, v10.4s\n"
+ "ldr s5, [x27, x22]\n"
+ "fmla v2.4s, v9.4s, v11.4s\n"
+ "ldr s4, [x28, x22]\n"
+ "fmla v1.4s, v18.4s, v20.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v3.4s, v6.4s, v21.4s\n"
+ "ldr s19, [%[wbptr]]\n"
+ "fmla v2.4s, v6.4s, v20.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "fmla v1.4s, v6.4s, v23.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v6.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v3.4s, v25.4s, v15.4s\n"
+ "ldr s16, [%[inptr0]]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v25.4s, v23.4s\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "ldr s9, [%[inptr0], x23]\n"
+ "fmla v1.4s, v24.4s, v21.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "ldr s28, [x21]\n"
+ "fmla v0.4s, v24.4s, v20.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "ldr s8, [x21, %[input_col_stride1]]\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v2.4s, v27.4s, v21.4s\n"
+ "ldr s20, [%[wbptr], #28]\n"
+ "fmla v0.4s, v27.4s, v23.4s\n"
+ "ldr s24, [x27]\n"
+ "fmla v1.4s, v17.4s, v14.4s\n"
+ "ldr s6, [x27, %[input_col_stride1]]\n"
+ "fmla v2.4s, v7.4s, v15.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v0.4s, v7.4s, v10.4s\n"
+ "ldr s23, [%[wbptr], #20]\n"
+ "movi v25.16b, #0\n"
+ "ldr s18, [x28]\n"
+ "fmla v2.4s, v5.4s, v14.4s\n"
+ "subs x25, x25, #1\n"
+ "fmla v0.4s, v17.4s, v21.4s\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "fmov v26.4s, #6.0\n"
+ "fmax v3.4s, v3.4s, v25.4s\n"
+ "fmax v2.4s, v2.4s, v25.4s\n"
+ "fmax v1.4s, v1.4s, v25.4s\n"
+ "fmla v0.4s, v5.4s, v15.4s\n"
+ "ldr s21, [%[wbptr], #32]\n"
+ "fmin v3.4s, v3.4s, v26.4s\n"
+ "fmin v2.4s, v2.4s, v26.4s\n"
+ "fmin v1.4s, v1.4s, v26.4s\n"
+ "str s3, [%[outptr0]]\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v4.4s, v14.4s\n"
+ "str s1, [x24]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr s15, [%[wbptr], #24]\n"
+ "fmax v0.4s, v0.4s, v25.4s\n"
+ "ldr s14, [%[wbptr], #36]\n"
+ "mov v2.16b, v19.16b\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmin v0.4s, v0.4s, v26.4s\n"
+ "fmla v3.4s, v16.4s, v12.4s\n"
+ "fmla v1.4s, v28.4s, v12.4s\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "str s0, [x24, %[output_col_stride1]]\n"
+ "mov v0.16b, v19.16b\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v3.4s, v24.4s, v20.4s\n"
+ "ldr s25, [x21, x23]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "ldr s28, [%[inptr0], x22]\n"
+ "fmla v2.4s, v8.4s, v13.4s\n"
+ "ldr s24, [x28, %[input_col_stride1]]\n"
+ "fmla v3.4s, v8.4s, v23.4s\n"
+ "ldr s27, [x27, x23]\n"
+ "fmla v1.4s, v8.4s, v11.4s\n"
+ "ldr s7, [x21, x22]\n"
+ "fmla v0.4s, v8.4s, v12.4s\n"
+ "ldr s17, [x28, x23]\n"
+ "fmla v3.4s, v9.4s, v10.4s\n"
+ "ldr s5, [x27, x22]\n"
+ "fmla v2.4s, v9.4s, v11.4s\n"
+ "ldr s4, [x28, x22]\n"
+ "fmla v1.4s, v18.4s, v20.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v3.4s, v6.4s, v21.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v2.4s, v6.4s, v20.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v1.4s, v6.4s, v23.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v0.4s, v6.4s, v13.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v3.4s, v25.4s, v15.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v23.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "movi v25.16b, #0\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "fmov v26.4s, #6.0\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v21.4s\n"
+ "fmla v0.4s, v24.4s, v20.4s\n"
+ "fmax v3.4s, v3.4s, v25.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v2.4s, v27.4s, v21.4s\n"
+ "fmla v0.4s, v27.4s, v23.4s\n"
+ "fmin v3.4s, v3.4s, v26.4s\n"
+ "str s3, [%[outptr0]]\n"
+ "fmla v2.4s, v7.4s, v15.4s\n"
+ "fmla v0.4s, v7.4s, v10.4s\n"
+ "fmla v1.4s, v17.4s, v14.4s\n"
+ "fmla v2.4s, v5.4s, v14.4s\n"
+ "fmla v0.4s, v17.4s, v21.4s\n"
+ "fmax v1.4s, v1.4s, v25.4s\n"
+ "fmax v2.4s, v2.4s, v25.4s\n"
+ "fmla v0.4s, v5.4s, v15.4s\n"
+ "fmin v1.4s, v1.4s, v26.4s\n"
+ "fmin v2.4s, v2.4s, v26.4s\n"
+ "str s1, [x24]\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v4.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmax v0.4s, v0.4s, v25.4s\n"
+ "fmin v0.4s, v0.4s, v26.4s\n"
+ "str s0, [x24, %[output_col_stride1]]\n"
+ "add x24, x24, #4\n"
+ "7:\n"
+ : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
}
#endif // __aarch64__
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 9ce43f9..010dd81 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,74 +25,2785 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+
+#ifdef __aarch64__
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x23, %[inptr0], %[input_row_stride]\n"
+ "add x19, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x22, %[outptr0], %[output_row_stride]\n"
+ "add x24, x23, %[input_row_stride]\n"
+ "add x20, x19, %[input_col_stride1]\n"
+ "and x27, %[n_channels], #3\n"
+ "add x25, x24, %[input_row_stride]\n"
+ "add x21, x20, %[input_col_stride1]\n"
+ "lsr x28, %[n_channels], #2\n"
+ "add x26, x25, %[input_row_stride]\n"
+ "cbz x28, 4f\n"
+ "1:\n"
+ "ldr q14, [%[wbptr]]\n"
+ "subs x28, x28, #1\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr q8, [%[wbptr], #16]\n"
+ "mov v10.16b, v14.16b\n"
+ "ldr q7, [%[wbptr], #32]\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr q6, [%[wbptr], #48]\n"
+ "mov v9.16b, v14.16b\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "ldr q15, [%[inptr0]]\n"
+ "fmla v12.4s, v15.4s, v8.4s\n"
+ "ldr q20, [x23]\n"
+ "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q17, [x24]\n"
+ "fmla v10.4s, v17.4s, v8.4s\n"
+ "ldr q16, [x23, %[input_col_stride1]]\n"
+ "fmla v12.4s, v20.4s, v5.4s\n"
+ "ldr q18, [%[inptr0], x19]\n"
+ "ldr q14, [x25]\n"
+ "ldr q15, [x24, %[input_col_stride1]]\n"
+ "fmla v12.4s, v13.4s, v7.4s\n"
+ "fmla v12.4s, v17.4s, v2.4s\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v11.4s, v18.4s, v8.4s\n"
+ "ldr q19, [x23, x19]\n"
+ "fmla v10.4s, v14.4s, v5.4s\n"
+ "ldr q20, [%[inptr0], x20]\n"
+ "fmla v12.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x26]\n"
+ "fmla v11.4s, v19.4s, v5.4s\n"
+ "ldr q13, [x25, %[input_col_stride1]]\n"
+ "fmla v10.4s, v15.4s, v7.4s\n"
+ "ldr q17, [x24, x19]\n"
+ "fmla v12.4s, v19.4s, v3.4s\n"
+ "ldr q19, [x23, x20]\n"
+ "fmla v11.4s, v20.4s, v7.4s\n"
+ "ldr q18, [%[inptr0], x21]\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "ldr q16, [x26, %[input_col_stride1]]\n"
+ "fmla v12.4s, v17.4s, v0.4s\n"
+ "ldr q14, [x25, x19]\n"
+ "fmla v11.4s, v17.4s, v2.4s\n"
+ "ldr q15, [x24, x20]\n"
+ "fmla v10.4s, v13.4s, v4.4s\n"
+ "ldr q13, [x23, x21]\n"
+ "str q12, [%[outptr0]]\n"
+ "fmla v9.4s, v17.4s, v8.4s\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr q12, [x26, x19]\n"
+ "fmla v10.4s, v17.4s, v6.4s\n"
+ "ldr q20, [x25, x20]\n"
+ "fmla v9.4s, v14.4s, v5.4s\n"
+ "ldr q17, [x24, x21]\n"
+ "fmla v11.4s, v18.4s, v6.4s\n"
+ "ldr q19, [x26, x20]\n"
+ "fmla v10.4s, v16.4s, v1.4s\n"
+ "ldr q18, [x25, x21]\n"
+ "fmla v9.4s, v15.4s, v7.4s\n"
+ "ldr q16, [x26, x21]\n"
+ "fmla v11.4s, v15.4s, v1.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr q14, [%[wbptr]]\n"
+ "fmla v9.4s, v12.4s, v2.4s\n"
+ "ldr q8, [%[wbptr], #16]\n"
+ "fmla v11.4s, v13.4s, v3.4s\n"
+ "ldr q7, [%[wbptr], #32]\n"
+ "fmla v10.4s, v12.4s, v0.4s\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "fmla v9.4s, v20.4s, v4.4s\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "fmla v11.4s, v17.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "str q10, [x22]\n"
+ "mov v12.16b, v14.16b\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "str q11, [%[outptr0], %[output_col_stride1]]\n"
+ "mov v10.16b, v14.16b\n"
+ "mov v11.16b, v14.16b\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr q6, [%[wbptr], #48]\n"
+ "ldr q15, [%[inptr0]]\n"
+ "add x23, x23, #16\n"
+ "fmla v12.4s, v15.4s, v8.4s\n"
+ "ldr q20, [x23]\n"
+ "fmla v9.4s, v18.4s, v3.4s\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
+ "add x24, x24, #16\n"
+ "fmla v12.4s, v20.4s, v5.4s\n"
+ "ldr q17, [x24]\n"
+ "fmla v9.4s, v16.4s, v0.4s\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "fmla v10.4s, v17.4s, v8.4s\n"
+ "ldr q16, [x23, %[input_col_stride1]]\n"
+ "fmla v12.4s, v13.4s, v7.4s\n"
+ "ldr q18, [%[inptr0], x19]\n"
+ "str q9, [x22, %[output_col_stride1]]\n"
+ "add x25, x25, #16\n"
+ "mov v9.16b, v14.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "fmla v12.4s, v17.4s, v2.4s\n"
+ "ldr q14, [x25]\n"
+ "ldr q15, [x24, %[input_col_stride1]]\n"
+ "add x26, x26, #16\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "add x22, x22, #16\n"
+ "subs x28, x28, #1\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v11.4s, v18.4s, v8.4s\n"
+ "ldr q19, [x23, x19]\n"
+ "fmla v10.4s, v14.4s, v5.4s\n"
+ "ldr q20, [%[inptr0], x20]\n"
+ "fmla v12.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x26]\n"
+ "fmla v11.4s, v19.4s, v5.4s\n"
+ "ldr q13, [x25, %[input_col_stride1]]\n"
+ "fmla v10.4s, v15.4s, v7.4s\n"
+ "ldr q17, [x24, x19]\n"
+ "fmla v12.4s, v19.4s, v3.4s\n"
+ "ldr q19, [x23, x20]\n"
+ "fmla v11.4s, v20.4s, v7.4s\n"
+ "ldr q18, [%[inptr0], x21]\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "ldr q16, [x26, %[input_col_stride1]]\n"
+ "fmla v12.4s, v17.4s, v0.4s\n"
+ "ldr q14, [x25, x19]\n"
+ "fmla v11.4s, v17.4s, v2.4s\n"
+ "ldr q15, [x24, x20]\n"
+ "fmla v10.4s, v13.4s, v4.4s\n"
+ "ldr q13, [x23, x21]\n"
+ "str q12, [%[outptr0]]\n"
+ "fmla v9.4s, v17.4s, v8.4s\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr q12, [x26, x19]\n"
+ "fmla v10.4s, v17.4s, v6.4s\n"
+ "ldr q20, [x25, x20]\n"
+ "fmla v9.4s, v14.4s, v5.4s\n"
+ "ldr q17, [x24, x21]\n"
+ "fmla v11.4s, v18.4s, v6.4s\n"
+ "ldr q19, [x26, x20]\n"
+ "fmla v10.4s, v16.4s, v1.4s\n"
+ "ldr q18, [x25, x21]\n"
+ "fmla v9.4s, v15.4s, v7.4s\n"
+ "ldr q16, [x26, x21]\n"
+ "fmla v11.4s, v15.4s, v1.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v12.4s, v2.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v11.4s, v13.4s, v3.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v10.4s, v12.4s, v0.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v9.4s, v20.4s, v4.4s\n"
+ "add x25, x25, #16\n"
+ "fmla v11.4s, v17.4s, v0.4s\n"
+ "add x26, x26, #16\n"
+ "str q10, [x22]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "str q11, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v9.4s, v18.4s, v3.4s\n"
+ "fmla v9.4s, v16.4s, v0.4s\n"
+ "str q9, [x22, %[output_col_stride1]]\n"
+ "add x22, x22, #16\n"
+ "4:\n"
+ "cbz x27, 7f\n"
+ "ldr s14, [%[wbptr]]\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr s8, [%[wbptr], #4]\n"
+ "mov v10.16b, v14.16b\n"
+ "ldr s7, [%[wbptr], #8]\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr s6, [%[wbptr], #12]\n"
+ "mov v9.16b, v14.16b\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "subs x27, x27, #1\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "ldr s15, [%[inptr0]]\n"
+ "ldr s20, [x23]\n"
+ "fmla v12.4s, v15.4s, v8.4s\n"
+ "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s17, [x24]\n"
+ "ldr s16, [x23, %[input_col_stride1]]\n"
+ "fmla v10.4s, v17.4s, v8.4s\n"
+ "ldr s18, [%[inptr0], x19]\n"
+ "fmla v12.4s, v20.4s, v5.4s\n"
+ "ldr s14, [x25]\n"
+ "ldr s15, [x24, %[input_col_stride1]]\n"
+ "fmla v12.4s, v13.4s, v7.4s\n"
+ "fmla v12.4s, v17.4s, v2.4s\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v11.4s, v18.4s, v8.4s\n"
+ "ldr s19, [x23, x19]\n"
+ "fmla v10.4s, v14.4s, v5.4s\n"
+ "ldr s20, [%[inptr0], x20]\n"
+ "fmla v12.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x26]\n"
+ "fmla v11.4s, v19.4s, v5.4s\n"
+ "ldr s13, [x25, %[input_col_stride1]]\n"
+ "fmla v10.4s, v15.4s, v7.4s\n"
+ "ldr s17, [x24, x19]\n"
+ "fmla v12.4s, v19.4s, v3.4s\n"
+ "ldr s19, [x23, x20]\n"
+ "fmla v11.4s, v20.4s, v7.4s\n"
+ "ldr s18, [%[inptr0], x21]\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "ldr s16, [x26, %[input_col_stride1]]\n"
+ "fmla v12.4s, v17.4s, v0.4s\n"
+ "ldr s14, [x25, x19]\n"
+ "fmla v11.4s, v17.4s, v2.4s\n"
+ "ldr s15, [x24, x20]\n"
+ "fmla v10.4s, v13.4s, v4.4s\n"
+ "ldr s13, [x23, x21]\n"
+ "str s12, [%[outptr0]]\n"
+ "fmla v9.4s, v17.4s, v8.4s\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr s12, [x26, x19]\n"
+ "fmla v10.4s, v17.4s, v6.4s\n"
+ "ldr s20, [x25, x20]\n"
+ "fmla v9.4s, v14.4s, v5.4s\n"
+ "ldr s17, [x24, x21]\n"
+ "fmla v11.4s, v18.4s, v6.4s\n"
+ "ldr s19, [x26, x20]\n"
+ "fmla v10.4s, v16.4s, v1.4s\n"
+ "ldr s18, [x25, x21]\n"
+ "fmla v9.4s, v15.4s, v7.4s\n"
+ "ldr s16, [x26, x21]\n"
+ "fmla v11.4s, v15.4s, v1.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr s14, [%[wbptr]]\n"
+ "fmla v9.4s, v12.4s, v2.4s\n"
+ "ldr s8, [%[wbptr], #4]\n"
+ "fmla v11.4s, v13.4s, v3.4s\n"
+ "ldr s7, [%[wbptr], #8]\n"
+ "fmla v10.4s, v12.4s, v0.4s\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "fmla v9.4s, v20.4s, v4.4s\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "fmla v11.4s, v17.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "str s10, [x22]\n"
+ "mov v12.16b, v14.16b\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "str s11, [%[outptr0], %[output_col_stride1]]\n"
+ "mov v10.16b, v14.16b\n"
+ "mov v11.16b, v14.16b\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr s6, [%[wbptr], #12]\n"
+ "ldr s15, [%[inptr0]]\n"
+ "add x23, x23, #4\n"
+ "fmla v12.4s, v15.4s, v8.4s\n"
+ "ldr s20, [x23]\n"
+ "fmla v9.4s, v18.4s, v3.4s\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr s13, [%[inptr0], %[input_col_stride1]]\n"
+ "add x24, x24, #4\n"
+ "fmla v12.4s, v20.4s, v5.4s\n"
+ "ldr s17, [x24]\n"
+ "fmla v9.4s, v16.4s, v0.4s\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "fmla v10.4s, v17.4s, v8.4s\n"
+ "ldr s16, [x23, %[input_col_stride1]]\n"
+ "fmla v12.4s, v13.4s, v7.4s\n"
+ "ldr s18, [%[inptr0], x19]\n"
+ "str s9, [x22, %[output_col_stride1]]\n"
+ "add x25, x25, #4\n"
+ "mov v9.16b, v14.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "fmla v12.4s, v17.4s, v2.4s\n"
+ "ldr s14, [x25]\n"
+ "ldr s15, [x24, %[input_col_stride1]]\n"
+ "add x26, x26, #4\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "add x22, x22, #4\n"
+ "subs x27, x27, #1\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v11.4s, v18.4s, v8.4s\n"
+ "ldr s19, [x23, x19]\n"
+ "fmla v10.4s, v14.4s, v5.4s\n"
+ "ldr s20, [%[inptr0], x20]\n"
+ "fmla v12.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x26]\n"
+ "fmla v11.4s, v19.4s, v5.4s\n"
+ "ldr s13, [x25, %[input_col_stride1]]\n"
+ "fmla v10.4s, v15.4s, v7.4s\n"
+ "ldr s17, [x24, x19]\n"
+ "fmla v12.4s, v19.4s, v3.4s\n"
+ "ldr s19, [x23, x20]\n"
+ "fmla v11.4s, v20.4s, v7.4s\n"
+ "ldr s18, [%[inptr0], x21]\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "ldr s16, [x26, %[input_col_stride1]]\n"
+ "fmla v12.4s, v17.4s, v0.4s\n"
+ "ldr s14, [x25, x19]\n"
+ "fmla v11.4s, v17.4s, v2.4s\n"
+ "ldr s15, [x24, x20]\n"
+ "fmla v10.4s, v13.4s, v4.4s\n"
+ "ldr s13, [x23, x21]\n"
+ "str s12, [%[outptr0]]\n"
+ "fmla v9.4s, v17.4s, v8.4s\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr s12, [x26, x19]\n"
+ "fmla v10.4s, v17.4s, v6.4s\n"
+ "ldr s20, [x25, x20]\n"
+ "fmla v9.4s, v14.4s, v5.4s\n"
+ "ldr s17, [x24, x21]\n"
+ "fmla v11.4s, v18.4s, v6.4s\n"
+ "ldr s19, [x26, x20]\n"
+ "fmla v10.4s, v16.4s, v1.4s\n"
+ "ldr s18, [x25, x21]\n"
+ "fmla v9.4s, v15.4s, v7.4s\n"
+ "ldr s16, [x26, x21]\n"
+ "fmla v11.4s, v15.4s, v1.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v12.4s, v2.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v11.4s, v13.4s, v3.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v10.4s, v12.4s, v0.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v9.4s, v20.4s, v4.4s\n"
+ "add x25, x25, #4\n"
+ "fmla v11.4s, v17.4s, v0.4s\n"
+ "add x26, x26, #4\n"
+ "str s10, [x22]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "str s11, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v9.4s, v18.4s, v3.4s\n"
+ "fmla v9.4s, v16.4s, v0.4s\n"
+ "str s9, [x22, %[output_col_stride1]]\n"
+ "add x22, x22, #4\n"
+ "7:\n"
+ : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x23, xzr\n"
+ "mov x24, xzr\n"
+ "and x25, %[n_channels], #3\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q13, [%[wbptr]]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "subs x26, x26, #1\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "ldr q14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "ldr q14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr q19, [x19, x23]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr q15, [x20, x23]\n"
+ "ldr q18, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr q13, [x19, x23]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr q14, [x20, x23]\n"
+ "ldr q17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr q13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr q17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr q12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr q11, [x22, x23]\n"
+ "ldr q13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr q14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "ldr q13, [%[wbptr]]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "str q8, [x28, x24]\n"
+ "add x23, x23, #16\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "str q9, [x21, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "mov v9.16b, v13.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "subs x26, x26, #1\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "ldr q14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "ldr q14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "ldr q19, [x19, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x20, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str q7, [x28, x24]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr q13, [x19, x23]\n"
+ "ldr q18, [x21, x23]\n"
+ "add x24, x24, #16\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr q14, [x20, x23]\n"
+ "ldr q17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr q13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr q18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr q17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr q15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr q12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr q11, [x22, x23]\n"
+ "ldr q13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr q14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr q16, [x27, x23]\n"
+ "ldr q15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "str q8, [x28, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str q9, [x21, x24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "str q7, [x28, x24]\n"
+ "add x24, x24, #16\n"
+ "4:\n"
+ "cbz x25, 7f\n"
+ "ldr s13, [%[wbptr]]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "subs x25, x25, #1\n"
+ "ldr s14, [x19, x23]\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "ldr s19, [x19, x23]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr s15, [x20, x23]\n"
+ "ldr s18, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr s13, [x19, x23]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr s14, [x20, x23]\n"
+ "ldr s17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr s13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr s17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr s12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr s11, [x22, x23]\n"
+ "ldr s13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "ldr s13, [%[wbptr]]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v10.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "str s8, [x28, x24]\n"
+ "add x23, x23, #4\n"
+ "mov v8.16b, v13.16b\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "str s9, [x21, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "mov v9.16b, v13.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x27, [%[inptrs], 120]\n"
+ "subs x25, x25, #1\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "ldr s14, [x19, x23]\n"
+ "fmla v10.4s, v14.4s, v12.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "ldr s14, [x21, x23]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "ldr s19, [x19, x23]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmla v10.4s, v18.4s, v11.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x20, x23]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str s7, [x28, x24]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v7.16b, v13.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr s13, [x19, x23]\n"
+ "ldr s18, [x21, x23]\n"
+ "add x24, x24, #4\n"
+ "fmla v10.4s, v14.4s, v2.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v8.4s, v14.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v10.4s, v15.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v9.4s, v13.4s, v12.4s\n"
+ "ldr s14, [x20, x23]\n"
+ "ldr s17, [x19, x23]\n"
+ "ldr x22, [%[inptrs], 160]\n"
+ "fmla v8.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 128]\n"
+ "fmla v10.4s, v13.4s, v5.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v9.4s, v14.4s, v11.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v8.4s, v18.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 168]\n"
+ "fmla v10.4s, v18.4s, v1.4s\n"
+ "ldr s13, [x21, x23]\n"
+ "fmla v9.4s, v17.4s, v6.4s\n"
+ "ldr s18, [x20, x23]\n"
+ "fmla v7.4s, v13.4s, v12.4s\n"
+ "ldr s17, [x19, x23]\n"
+ "fmla v8.4s, v15.4s, v2.4s\n"
+ "ldr s15, [x22, x23]\n"
+ "fmla v10.4s, v14.4s, v3.4s\n"
+ "ldr x27, [%[inptrs], 136]\n"
+ "fmla v9.4s, v13.4s, v2.4s\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v8.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x21, x23]\n"
+ "fmla v10.4s, v13.4s, v0.4s\n"
+ "ldr s12, [x20, x23]\n"
+ "fmla v9.4s, v18.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 176]\n"
+ "fmla v7.4s, v16.4s, v11.4s\n"
+ "ldr x27, [%[inptrs], 144]\n"
+ "fmla v8.4s, v13.4s, v5.4s\n"
+ "ldr s11, [x22, x23]\n"
+ "ldr s13, [x27, x23]\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v9.4s, v17.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 184]\n"
+ "fmla v7.4s, v19.4s, v6.4s\n"
+ "ldr s14, [x21, x23]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s17, [x22, x23]\n"
+ "ldr x27, [%[inptrs], 152]\n"
+ "ldr x22, [%[inptrs], 192]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s10, [x21, x24]\n"
+ "fmla v7.4s, v11.4s, v2.4s\n"
+ "fmla v8.4s, v16.4s, v3.4s\n"
+ "ldr s16, [x27, x23]\n"
+ "ldr s15, [x22, x23]\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v9.4s, v12.4s, v3.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v7.4s, v13.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v8.4s, v11.4s, v0.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v9.4s, v14.4s, v0.4s\n"
+ "fmla v7.4s, v14.4s, v5.4s\n"
+ "str s8, [x28, x24]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str s9, [x21, x24]\n"
+ "fmla v7.4s, v17.4s, v1.4s\n"
+ "fmla v7.4s, v16.4s, v3.4s\n"
+ "fmla v7.4s, v15.4s, v0.4s\n"
+ "str s7, [x28, x24]\n"
+ "add x24, x24, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x24, %[inptr0], %[input_row_stride]\n"
+ "add x27, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x19, %[outptr0], %[output_row_stride]\n"
+ "add x25, x24, %[input_row_stride]\n"
+ "add x23, x27, %[input_col_stride1]\n"
+ "and x20, %[n_channels], #3\n"
+ "add x28, x25, %[input_row_stride]\n"
+ "add x22, x23, %[input_col_stride1]\n"
+ "lsr x21, %[n_channels], #2\n"
+ "add x26, x28, %[input_row_stride]\n"
+ "cbz x21, 4f\n"
+ "1:\n"
+ "ldr q16, [%[wbptr]]\n"
+ "subs x21, x21, #1\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q4, [%[wbptr], #16]\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr q5, [%[wbptr], #32]\n"
+ "mov v2.16b, v16.16b\n"
+ "ldr q12, [%[wbptr], #48]\n"
+ "mov v0.16b, v16.16b\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "ldr q7, [%[wbptr], #144]\n"
+ "ldr q21, [%[inptr0]]\n"
+ "fmla v3.4s, v21.4s, v4.4s\n"
+ "ldr q23, [x24]\n"
+ "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q14, [x25]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr q13, [x24, %[input_col_stride1]]\n"
+ "fmla v3.4s, v23.4s, v11.4s\n"
+ "ldr q18, [%[inptr0], x27]\n"
+ "ldr q15, [x28]\n"
+ "ldr q22, [x25, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v5.4s\n"
+ "fmla v3.4s, v14.4s, v9.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v3.4s, v13.4s, v10.4s\n"
+ "ldr q17, [x24, x27]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr q20, [%[inptr0], x23]\n"
+ "fmla v1.4s, v15.4s, v11.4s\n"
+ "ldr q19, [x26]\n"
+ "fmla v3.4s, v18.4s, v12.4s\n"
+ "ldr q13, [x28, %[input_col_stride1]]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x27]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "ldr q15, [x24, x23]\n"
+ "fmla v3.4s, v22.4s, v8.4s\n"
+ "ldr q16, [%[inptr0], x22]\n"
+ "fmla v2.4s, v20.4s, v5.4s\n"
+ "ldr q20, [x26, %[input_col_stride1]]\n"
+ "fmla v1.4s, v19.4s, v9.4s\n"
+ "ldr q19, [x28, x27]\n"
+ "fmla v3.4s, v17.4s, v6.4s\n"
+ "ldr q21, [x25, x23]\n"
+ "fmla v2.4s, v14.4s, v9.4s\n"
+ "ldr q22, [x24, x22]\n"
+ "fmla v1.4s, v13.4s, v10.4s\n"
+ "ldr q23, [x26, x27]\n"
+ "fmla v3.4s, v14.4s, v7.4s\n"
+ "ldr q18, [x28, x23]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "ldr q13, [x25, x22]\n"
+ "fmla v1.4s, v14.4s, v12.4s\n"
+ "ldr q14, [x26, x23]\n"
+ "fmla v2.4s, v15.4s, v10.4s\n"
+ "ldr q17, [x28, x22]\n"
+ "fmla v0.4s, v19.4s, v11.4s\n"
+ "ldr q15, [x26, x22]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr q16, [%[wbptr]]\n"
+ "fmla v0.4s, v21.4s, v5.4s\n"
+ "ldr q4, [%[wbptr], #16]\n"
+ "fmla v1.4s, v19.4s, v6.4s\n"
+ "ldr q11, [%[wbptr], #64]\n"
+ "fmla v2.4s, v21.4s, v8.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "ldr q5, [%[wbptr], #32]\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v2.4s, v22.4s, v6.4s\n"
+ "ldr q21, [%[inptr0]]\n"
+ "fmla v0.4s, v18.4s, v10.4s\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "movi v20.16b, #0\n"
+ "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v13.4s, v7.4s\n"
+ "ldr q18, [%[inptr0], x27]\n"
+ "fmla v0.4s, v13.4s, v12.4s\n"
+ "ldr q10, [%[wbptr], #80]\n"
+ "fmax v3.4s, v3.4s, v20.4s\n"
+ "add x24, x24, #16\n"
+ "fmax v2.4s, v2.4s, v20.4s\n"
+ "ldr q23, [x24]\n"
+ "str q3, [%[outptr0]]\n"
+ "fmla v0.4s, v14.4s, v8.4s\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v1.4s, v1.4s, v20.4s\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q12, [%[wbptr], #48]\n"
+ "str q1, [x19]\n"
+ "fmla v0.4s, v17.4s, v6.4s\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "mov v2.16b, v16.16b\n"
+ "ldr q13, [x24, %[input_col_stride1]]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "fmla v3.4s, v21.4s, v4.4s\n"
+ "add x25, x25, #16\n"
+ "ldr q14, [x25]\n"
+ "add x28, x28, #16\n"
+ "fmax v0.4s, v0.4s, v20.4s\n"
+ "ldr q7, [%[wbptr], #144]\n"
+ "fmla v3.4s, v23.4s, v11.4s\n"
+ "ldr q15, [x28]\n"
+ "str q0, [x19, %[output_col_stride1]]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "mov v0.16b, v16.16b\n"
+ "ldr q22, [x25, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v5.4s\n"
+ "add x26, x26, #16\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "add x19, x19, #16\n"
+ "subs x21, x21, #1\n"
+ "fmla v3.4s, v14.4s, v9.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v3.4s, v13.4s, v10.4s\n"
+ "ldr q17, [x24, x27]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr q20, [%[inptr0], x23]\n"
+ "fmla v1.4s, v15.4s, v11.4s\n"
+ "ldr q19, [x26]\n"
+ "fmla v3.4s, v18.4s, v12.4s\n"
+ "ldr q13, [x28, %[input_col_stride1]]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x27]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "ldr q15, [x24, x23]\n"
+ "fmla v3.4s, v22.4s, v8.4s\n"
+ "ldr q16, [%[inptr0], x22]\n"
+ "fmla v2.4s, v20.4s, v5.4s\n"
+ "ldr q20, [x26, %[input_col_stride1]]\n"
+ "fmla v1.4s, v19.4s, v9.4s\n"
+ "ldr q19, [x28, x27]\n"
+ "fmla v3.4s, v17.4s, v6.4s\n"
+ "ldr q21, [x25, x23]\n"
+ "fmla v2.4s, v14.4s, v9.4s\n"
+ "ldr q22, [x24, x22]\n"
+ "fmla v1.4s, v13.4s, v10.4s\n"
+ "ldr q23, [x26, x27]\n"
+ "fmla v3.4s, v14.4s, v7.4s\n"
+ "ldr q18, [x28, x23]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "ldr q13, [x25, x22]\n"
+ "fmla v1.4s, v14.4s, v12.4s\n"
+ "ldr q14, [x26, x23]\n"
+ "fmla v2.4s, v15.4s, v10.4s\n"
+ "ldr q17, [x28, x22]\n"
+ "fmla v0.4s, v19.4s, v11.4s\n"
+ "ldr q15, [x26, x22]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v21.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v1.4s, v19.4s, v6.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v2.4s, v21.4s, v8.4s\n"
+ "add x25, x25, #16\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v2.4s, v22.4s, v6.4s\n"
+ "movi v20.16b, #0\n"
+ "fmla v0.4s, v18.4s, v10.4s\n"
+ "fmax v3.4s, v3.4s, v20.4s\n"
+ "fmla v2.4s, v13.4s, v7.4s\n"
+ "fmax v1.4s, v1.4s, v20.4s\n"
+ "str q3, [%[outptr0]]\n"
+ "fmla v0.4s, v13.4s, v12.4s\n"
+ "str q1, [x19]\n"
+ "fmax v2.4s, v2.4s, v20.4s\n"
+ "fmla v0.4s, v14.4s, v8.4s\n"
+ "str q2, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v0.4s, v17.4s, v6.4s\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmax v0.4s, v0.4s, v20.4s\n"
+ "str q0, [x19, %[output_col_stride1]]\n"
+ "add x19, x19, #16\n"
+ "4:\n"
+ "cbz x20, 7f\n"
+ "ldr s16, [%[wbptr]]\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr s4, [%[wbptr], #4]\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr s5, [%[wbptr], #8]\n"
+ "mov v2.16b, v16.16b\n"
+ "ldr s12, [%[wbptr], #12]\n"
+ "mov v0.16b, v16.16b\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "subs x20, x20, #1\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "ldr s7, [%[wbptr], #36]\n"
+ "ldr s21, [%[inptr0]]\n"
+ "ldr s23, [x24]\n"
+ "fmla v3.4s, v21.4s, v4.4s\n"
+ "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s14, [x25]\n"
+ "ldr s13, [x24, %[input_col_stride1]]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr s18, [%[inptr0], x27]\n"
+ "fmla v3.4s, v23.4s, v11.4s\n"
+ "ldr s15, [x28]\n"
+ "ldr s22, [x25, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v5.4s\n"
+ "fmla v3.4s, v14.4s, v9.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v3.4s, v13.4s, v10.4s\n"
+ "ldr s17, [x24, x27]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr s20, [%[inptr0], x23]\n"
+ "fmla v1.4s, v15.4s, v11.4s\n"
+ "ldr s19, [x26]\n"
+ "fmla v3.4s, v18.4s, v12.4s\n"
+ "ldr s13, [x28, %[input_col_stride1]]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x27]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "ldr s15, [x24, x23]\n"
+ "fmla v3.4s, v22.4s, v8.4s\n"
+ "ldr s16, [%[inptr0], x22]\n"
+ "fmla v2.4s, v20.4s, v5.4s\n"
+ "ldr s20, [x26, %[input_col_stride1]]\n"
+ "fmla v1.4s, v19.4s, v9.4s\n"
+ "ldr s19, [x28, x27]\n"
+ "fmla v3.4s, v17.4s, v6.4s\n"
+ "ldr s21, [x25, x23]\n"
+ "fmla v2.4s, v14.4s, v9.4s\n"
+ "ldr s22, [x24, x22]\n"
+ "fmla v1.4s, v13.4s, v10.4s\n"
+ "ldr s23, [x26, x27]\n"
+ "fmla v3.4s, v14.4s, v7.4s\n"
+ "ldr s18, [x28, x23]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "ldr s13, [x25, x22]\n"
+ "fmla v1.4s, v14.4s, v12.4s\n"
+ "ldr s14, [x26, x23]\n"
+ "fmla v2.4s, v15.4s, v10.4s\n"
+ "ldr s17, [x28, x22]\n"
+ "fmla v0.4s, v19.4s, v11.4s\n"
+ "ldr s15, [x26, x22]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr s16, [%[wbptr]]\n"
+ "fmla v0.4s, v21.4s, v5.4s\n"
+ "ldr s4, [%[wbptr], #4]\n"
+ "fmla v1.4s, v19.4s, v6.4s\n"
+ "ldr s11, [%[wbptr], #16]\n"
+ "fmla v2.4s, v21.4s, v8.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "ldr s5, [%[wbptr], #8]\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v2.4s, v22.4s, v6.4s\n"
+ "ldr s21, [%[inptr0]]\n"
+ "fmla v0.4s, v18.4s, v10.4s\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "movi v20.16b, #0\n"
+ "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v13.4s, v7.4s\n"
+ "ldr s18, [%[inptr0], x27]\n"
+ "fmla v0.4s, v13.4s, v12.4s\n"
+ "ldr s10, [%[wbptr], #20]\n"
+ "fmax v3.4s, v3.4s, v20.4s\n"
+ "add x24, x24, #4\n"
+ "fmax v2.4s, v2.4s, v20.4s\n"
+ "ldr s23, [x24]\n"
+ "str s3, [%[outptr0]]\n"
+ "fmla v0.4s, v14.4s, v8.4s\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v1.4s, v1.4s, v20.4s\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr s12, [%[wbptr], #12]\n"
+ "str s1, [x19]\n"
+ "fmla v0.4s, v17.4s, v6.4s\n"
+ "mov v1.16b, v16.16b\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "mov v2.16b, v16.16b\n"
+ "ldr s13, [x24, %[input_col_stride1]]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "fmla v3.4s, v21.4s, v4.4s\n"
+ "add x25, x25, #4\n"
+ "ldr s14, [x25]\n"
+ "add x28, x28, #4\n"
+ "fmax v0.4s, v0.4s, v20.4s\n"
+ "ldr s7, [%[wbptr], #36]\n"
+ "fmla v3.4s, v23.4s, v11.4s\n"
+ "ldr s15, [x28]\n"
+ "str s0, [x19, %[output_col_stride1]]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "mov v0.16b, v16.16b\n"
+ "ldr s22, [x25, %[input_col_stride1]]\n"
+ "fmla v3.4s, v19.4s, v5.4s\n"
+ "add x26, x26, #4\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "add x19, x19, #4\n"
+ "subs x20, x20, #1\n"
+ "fmla v3.4s, v14.4s, v9.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v3.4s, v13.4s, v10.4s\n"
+ "ldr s17, [x24, x27]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr s20, [%[inptr0], x23]\n"
+ "fmla v1.4s, v15.4s, v11.4s\n"
+ "ldr s19, [x26]\n"
+ "fmla v3.4s, v18.4s, v12.4s\n"
+ "ldr s13, [x28, %[input_col_stride1]]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x27]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "ldr s15, [x24, x23]\n"
+ "fmla v3.4s, v22.4s, v8.4s\n"
+ "ldr s16, [%[inptr0], x22]\n"
+ "fmla v2.4s, v20.4s, v5.4s\n"
+ "ldr s20, [x26, %[input_col_stride1]]\n"
+ "fmla v1.4s, v19.4s, v9.4s\n"
+ "ldr s19, [x28, x27]\n"
+ "fmla v3.4s, v17.4s, v6.4s\n"
+ "ldr s21, [x25, x23]\n"
+ "fmla v2.4s, v14.4s, v9.4s\n"
+ "ldr s22, [x24, x22]\n"
+ "fmla v1.4s, v13.4s, v10.4s\n"
+ "ldr s23, [x26, x27]\n"
+ "fmla v3.4s, v14.4s, v7.4s\n"
+ "ldr s18, [x28, x23]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "ldr s13, [x25, x22]\n"
+ "fmla v1.4s, v14.4s, v12.4s\n"
+ "ldr s14, [x26, x23]\n"
+ "fmla v2.4s, v15.4s, v10.4s\n"
+ "ldr s17, [x28, x22]\n"
+ "fmla v0.4s, v19.4s, v11.4s\n"
+ "ldr s15, [x26, x22]\n"
+ "fmla v1.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v21.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v1.4s, v19.4s, v6.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v2.4s, v21.4s, v8.4s\n"
+ "add x25, x25, #4\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v2.4s, v22.4s, v6.4s\n"
+ "movi v20.16b, #0\n"
+ "fmla v0.4s, v18.4s, v10.4s\n"
+ "fmax v3.4s, v3.4s, v20.4s\n"
+ "fmla v2.4s, v13.4s, v7.4s\n"
+ "fmax v1.4s, v1.4s, v20.4s\n"
+ "str s3, [%[outptr0]]\n"
+ "fmla v0.4s, v13.4s, v12.4s\n"
+ "str s1, [x19]\n"
+ "fmax v2.4s, v2.4s, v20.4s\n"
+ "fmla v0.4s, v14.4s, v8.4s\n"
+ "str s2, [%[outptr0], %[output_col_stride1]]\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v0.4s, v17.4s, v6.4s\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmax v0.4s, v0.4s, v20.4s\n"
+ "str s0, [x19, %[output_col_stride1]]\n"
+ "add x19, x19, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+ : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x22, xzr\n"
+ "mov x26, xzr\n"
+ "and x23, %[n_channels], #3\n"
+ "lsr x24, %[n_channels], #2\n"
+ "cbz x24, 4f\n"
+ "1:\n"
+ "ldr q14, [%[wbptr]]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #48]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "ldr q9, [%[wbptr], #80]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr q8, [%[wbptr], #96]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr q7, [%[wbptr], #112]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "subs x24, x24, #1\n"
+ "ldr q5, [%[wbptr], #144]\n"
+ "ldr q15, [x19, x22]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr q10, [x19, x22]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q14, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr q16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr q17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr q13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr q16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr q17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr q12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr q15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr q11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr q16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "ldr q14, [%[wbptr]]\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "ldr q13, [%[wbptr], #16]\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q9, [%[wbptr], #80]\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "ldr q7, [%[wbptr], #112]\n"
+ "str q1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #48]\n"
+ "str q2, [x21, x26]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "mov v2.16b, v14.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "subs x24, x24, #1\n"
+ "ldr q15, [x19, x22]\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "ldr q8, [%[wbptr], #96]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr q16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "ldr q5, [%[wbptr], #144]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr q10, [x19, x22]\n"
+ "ldr q17, [x20, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str q0, [x28, x26]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "add x26, x26, #16\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr q16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr q17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr q14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr q17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr q13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr q16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr q17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr q12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr q15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr q14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr q11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str q3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr q18, [x27, x22]\n"
+ "ldr q17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr q16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "str q1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str q2, [x21, x26]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "str q0, [x28, x26]\n"
+ "add x26, x26, #16\n"
+ "4:\n"
+ "cbz x23, 7f\n"
+ "ldr s14, [%[wbptr]]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #12]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "ldr s9, [%[wbptr], #20]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr s8, [%[wbptr], #24]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr s7, [%[wbptr], #28]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "ldr s5, [%[wbptr], #36]\n"
+ "subs x23, x23, #1\n"
+ "ldr s15, [x19, x22]\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr s16, [x21, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "ldr s10, [x19, x22]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr s14, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr s16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr s17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr s13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr s16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr s17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr s12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr s15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr s11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr s16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "ldr s14, [%[wbptr]]\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "ldr s13, [%[wbptr], #4]\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s9, [%[wbptr], #20]\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "ldr s7, [%[wbptr], #28]\n"
+ "str s1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #12]\n"
+ "str s2, [x21, x26]\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "mov v2.16b, v14.16b\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x19, [%[inptrs], 0]\n"
+ "ldr x20, [%[inptrs], 40]\n"
+ "ldr x21, [%[inptrs], 80]\n"
+ "ldr x25, [%[inptrs], 120]\n"
+ "subs x23, x23, #1\n"
+ "ldr s15, [x19, x22]\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "ldr s8, [%[wbptr], #24]\n"
+ "fmla v3.4s, v15.4s, v13.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr s16, [x21, x22]\n"
+ "ldr x19, [%[inptrs], 8]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x20, [%[inptrs], 48]\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "ldr s5, [%[wbptr], #36]\n"
+ "fmla v3.4s, v17.4s, v12.4s\n"
+ "ldr s10, [x19, x22]\n"
+ "ldr s17, [x20, x22]\n"
+ "ldr x19, [%[inptrs], 16]\n"
+ "str s0, [x28, x26]\n"
+ "ldr x21, [%[inptrs], 88]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v3.4s, v10.4s, v11.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "add x26, x26, #4\n"
+ "fmla v3.4s, v16.4s, v7.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v1.4s, v16.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 56]\n"
+ "fmla v3.4s, v17.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 24]\n"
+ "fmla v2.4s, v18.4s, v13.4s\n"
+ "ldr s16, [x20, x22]\n"
+ "movi v10.16b, #0\n"
+ "ldr s17, [x19, x22]\n"
+ "fmla v1.4s, v15.4s, v12.4s\n"
+ "ldr x27, [%[inptrs], 160]\n"
+ "fmla v3.4s, v18.4s, v4.4s\n"
+ "ldr x25, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v12.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s15, [x25, x22]\n"
+ "ldr x21, [%[inptrs], 96]\n"
+ "fmla v1.4s, v14.4s, v11.4s\n"
+ "ldr x20, [%[inptrs], 64]\n"
+ "fmla v3.4s, v14.4s, v6.4s\n"
+ "ldr s14, [x21, x22]\n"
+ "fmla v2.4s, v17.4s, v11.4s\n"
+ "ldr s17, [x20, x22]\n"
+ "fmla v0.4s, v14.4s, v13.4s\n"
+ "ldr x19, [%[inptrs], 32]\n"
+ "fmla v1.4s, v18.4s, v7.4s\n"
+ "ldr x27, [%[inptrs], 168]\n"
+ "fmla v3.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x19, x22]\n"
+ "fmla v2.4s, v14.4s, v7.4s\n"
+ "ldr s13, [x27, x22]\n"
+ "ldr x25, [%[inptrs], 136]\n"
+ "ldr x21, [%[inptrs], 104]\n"
+ "ldr x20, [%[inptrs], 72]\n"
+ "fmla v1.4s, v15.4s, v9.4s\n"
+ "ldr x27, [%[inptrs], 176]\n"
+ "fmla v3.4s, v14.4s, v5.4s\n"
+ "ldr s16, [x25, x22]\n"
+ "fmla v2.4s, v17.4s, v9.4s\n"
+ "ldr s17, [x21, x22]\n"
+ "fmla v0.4s, v16.4s, v12.4s\n"
+ "ldr s12, [x20, x22]\n"
+ "fmla v1.4s, v14.4s, v4.4s\n"
+ "ldr s15, [x27, x22]\n"
+ "fmax v3.4s, v3.4s, v10.4s\n"
+ "ldr x25, [%[inptrs], 144]\n"
+ "fmla v2.4s, v18.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 112]\n"
+ "fmla v0.4s, v17.4s, v11.4s\n"
+ "ldr s14, [x25, x22]\n"
+ "fmla v1.4s, v13.4s, v6.4s\n"
+ "ldr s11, [x21, x22]\n"
+ "ldr x27, [%[inptrs], 184]\n"
+ "ldr x25, [%[inptrs], 152]\n"
+ "ldr x21, [%[outptrs], 0]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 16]\n"
+ "str s3, [x21, x26]\n"
+ "fmla v0.4s, v15.4s, v7.4s\n"
+ "fmla v1.4s, v16.4s, v8.4s\n"
+ "ldr s18, [x27, x22]\n"
+ "ldr s17, [x25, x22]\n"
+ "ldr x27, [%[inptrs], 192]\n"
+ "fmla v2.4s, v12.4s, v8.4s\n"
+ "ldr x21, [%[outptrs], 8]\n"
+ "fmla v0.4s, v14.4s, v9.4s\n"
+ "ldr s16, [x27, x22]\n"
+ "fmla v1.4s, v15.4s, v5.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v11.4s, v5.4s\n"
+ "fmla v0.4s, v11.4s, v4.4s\n"
+ "fmax v1.4s, v1.4s, v10.4s\n"
+ "fmax v2.4s, v2.4s, v10.4s\n"
+ "str s1, [x28, x26]\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "ldr x28, [%[outptrs], 24]\n"
+ "str s2, [x21, x26]\n"
+ "fmla v0.4s, v17.4s, v8.4s\n"
+ "fmla v0.4s, v16.4s, v5.4s\n"
+ "fmax v0.4s, v0.4s, v10.4s\n"
+ "str s0, [x28, x26]\n"
+ "add x26, x26, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- },
-};
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x21, %[inptr0], %[input_row_stride]\n"
+ "add x23, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x24, %[outptr0], %[output_row_stride]\n"
+ "add x28, x21, %[input_row_stride]\n"
+ "add x26, x23, %[input_col_stride1]\n"
+ "and x19, %[n_channels], #3\n"
+ "add x27, x28, %[input_row_stride]\n"
+ "add x25, x26, %[input_col_stride1]\n"
+ "lsr x20, %[n_channels], #2\n"
+ "add x22, x27, %[input_row_stride]\n"
+ "cbz x20, 4f\n"
+ "1:\n"
+ "ldr q14, [%[wbptr]]\n"
+ "subs x20, x20, #1\n"
+ "mov v5.16b, v14.16b\n"
+ "ldr q0, [%[wbptr], #16]\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr q1, [%[wbptr], #32]\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr q2, [%[wbptr], #48]\n"
+ "mov v10.16b, v14.16b\n"
+ "ldr q6, [%[wbptr], #64]\n"
+ "ldr q3, [%[wbptr], #80]\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "ldr q4, [%[wbptr], #112]\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "ldr q9, [%[wbptr], #144]\n"
+ "ldr q19, [%[inptr0]]\n"
+ "fmla v5.4s, v19.4s, v0.4s\n"
+ "ldr q15, [x21]\n"
+ "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q16, [x28]\n"
+ "fmla v11.4s, v16.4s, v0.4s\n"
+ "ldr q23, [x21, %[input_col_stride1]]\n"
+ "fmla v5.4s, v15.4s, v6.4s\n"
+ "ldr q18, [%[inptr0], x23]\n"
+ "ldr q17, [x27]\n"
+ "ldr q13, [x28, %[input_col_stride1]]\n"
+ "fmla v5.4s, v21.4s, v1.4s\n"
+ "fmla v5.4s, v16.4s, v4.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v5.4s, v23.4s, v3.4s\n"
+ "ldr q21, [x21, x23]\n"
+ "fmla v12.4s, v18.4s, v0.4s\n"
+ "ldr q20, [%[inptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v6.4s\n"
+ "ldr q19, [x22]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr q15, [x27, %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v6.4s\n"
+ "ldr q16, [x28, x23]\n"
+ "fmla v11.4s, v13.4s, v1.4s\n"
+ "ldr q17, [x21, x26]\n"
+ "fmla v5.4s, v13.4s, v8.4s\n"
+ "ldr q14, [%[inptr0], x25]\n"
+ "fmla v12.4s, v20.4s, v1.4s\n"
+ "ldr q20, [x22, %[input_col_stride1]]\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "fmla v5.4s, v21.4s, v7.4s\n"
+ "ldr q22, [x28, x26]\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "ldr q21, [x21, x25]\n"
+ "fmla v11.4s, v15.4s, v3.4s\n"
+ "ldr q23, [x22, x23]\n"
+ "fmla v5.4s, v16.4s, v9.4s\n"
+ "ldr q18, [x27, x26]\n"
+ "fmla v10.4s, v16.4s, v0.4s\n"
+ "ldr q15, [x28, x25]\n"
+ "fmla v11.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x22, x26]\n"
+ "fmla v12.4s, v17.4s, v3.4s\n"
+ "ldr q17, [x27, x25]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr q13, [x22, x25]\n"
+ "fmla v11.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v12.4s, v14.4s, v2.4s\n"
+ "ldr q14, [%[wbptr]]\n"
+ "fmla v10.4s, v22.4s, v1.4s\n"
+ "ldr q0, [%[wbptr], #16]\n"
+ "fmla v11.4s, v19.4s, v7.4s\n"
+ "ldr q6, [%[wbptr], #64]\n"
+ "fmla v12.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v10.4s, v23.4s, v4.4s\n"
+ "ldr q1, [%[wbptr], #32]\n"
+ "fmla v11.4s, v23.4s, v9.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v12.4s, v21.4s, v7.4s\n"
+ "ldr q19, [%[inptr0]]\n"
+ "fmla v10.4s, v18.4s, v3.4s\n"
+ "ldr q4, [%[wbptr], #112]\n"
+ "movi v20.16b, #0\n"
+ "ldr q21, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v12.4s, v15.4s, v9.4s\n"
+ "ldr q18, [%[inptr0], x23]\n"
+ "fmla v10.4s, v15.4s, v2.4s\n"
+ "ldr q3, [%[wbptr], #80]\n"
+ "fmov v22.4s, #6.0\n"
+ "add x21, x21, #16\n"
+ "fmax v5.4s, v5.4s, v20.4s\n"
+ "ldr q15, [x21]\n"
+ "fmla v10.4s, v16.4s, v8.4s\n"
+ "ldr q2, [%[wbptr], #48]\n"
+ "fmin v5.4s, v5.4s, v22.4s\n"
+ "ldr q23, [x21, %[input_col_stride1]]\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "add x28, x28, #16\n"
+ "str q5, [%[outptr0]]\n"
+ "fmla v10.4s, v17.4s, v7.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "ldr q16, [x28]\n"
+ "str q12, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v13.4s, v9.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "mov v5.16b, v14.16b\n"
+ "ldr q13, [x28, %[input_col_stride1]]\n"
+ "str q11, [x24]\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr q9, [%[wbptr], #144]\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "add x27, x27, #16\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr q17, [x27]\n"
+ "str q10, [x24, %[output_col_stride1]]\n"
+ "fmla v5.4s, v19.4s, v0.4s\n"
+ "mov v10.16b, v14.16b\n"
+ "add x22, x22, #16\n"
+ "fmla v11.4s, v16.4s, v0.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v5.4s, v15.4s, v6.4s\n"
+ "add x24, x24, #16\n"
+ "subs x20, x20, #1\n"
+ "fmla v5.4s, v21.4s, v1.4s\n"
+ "fmla v5.4s, v16.4s, v4.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v5.4s, v23.4s, v3.4s\n"
+ "ldr q21, [x21, x23]\n"
+ "fmla v12.4s, v18.4s, v0.4s\n"
+ "ldr q20, [%[inptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v6.4s\n"
+ "ldr q19, [x22]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr q15, [x27, %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v6.4s\n"
+ "ldr q16, [x28, x23]\n"
+ "fmla v11.4s, v13.4s, v1.4s\n"
+ "ldr q17, [x21, x26]\n"
+ "fmla v5.4s, v13.4s, v8.4s\n"
+ "ldr q14, [%[inptr0], x25]\n"
+ "fmla v12.4s, v20.4s, v1.4s\n"
+ "ldr q20, [x22, %[input_col_stride1]]\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr q19, [x27, x23]\n"
+ "fmla v5.4s, v21.4s, v7.4s\n"
+ "ldr q22, [x28, x26]\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "ldr q21, [x21, x25]\n"
+ "fmla v11.4s, v15.4s, v3.4s\n"
+ "ldr q23, [x22, x23]\n"
+ "fmla v5.4s, v16.4s, v9.4s\n"
+ "ldr q18, [x27, x26]\n"
+ "fmla v10.4s, v16.4s, v0.4s\n"
+ "ldr q15, [x28, x25]\n"
+ "fmla v11.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x22, x26]\n"
+ "fmla v12.4s, v17.4s, v3.4s\n"
+ "ldr q17, [x27, x25]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr q13, [x22, x25]\n"
+ "fmla v11.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v12.4s, v14.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v10.4s, v22.4s, v1.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v11.4s, v19.4s, v7.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v12.4s, v22.4s, v8.4s\n"
+ "add x28, x28, #16\n"
+ "fmla v10.4s, v23.4s, v4.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v11.4s, v23.4s, v9.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v12.4s, v21.4s, v7.4s\n"
+ "movi v20.16b, #0\n"
+ "fmla v10.4s, v18.4s, v3.4s\n"
+ "fmov v22.4s, #6.0\n"
+ "fmax v5.4s, v5.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmla v12.4s, v15.4s, v9.4s\n"
+ "fmla v10.4s, v15.4s, v2.4s\n"
+ "fmin v5.4s, v5.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "str q5, [%[outptr0]]\n"
+ "str q11, [x24]\n"
+ "fmla v10.4s, v16.4s, v8.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "str q12, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v17.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v10.4s, v13.4s, v9.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "str q10, [x24, %[output_col_stride1]]\n"
+ "add x24, x24, #16\n"
+ "4:\n"
+ "cbz x19, 7f\n"
+ "ldr s14, [%[wbptr]]\n"
+ "mov v5.16b, v14.16b\n"
+ "ldr s0, [%[wbptr], #4]\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr s1, [%[wbptr], #8]\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr s2, [%[wbptr], #12]\n"
+ "mov v10.16b, v14.16b\n"
+ "ldr s6, [%[wbptr], #16]\n"
+ "ldr s3, [%[wbptr], #20]\n"
+ "subs x19, x19, #1\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "ldr s4, [%[wbptr], #28]\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "ldr s9, [%[wbptr], #36]\n"
+ "ldr s19, [%[inptr0]]\n"
+ "ldr s15, [x21]\n"
+ "fmla v5.4s, v19.4s, v0.4s\n"
+ "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s16, [x28]\n"
+ "ldr s23, [x21, %[input_col_stride1]]\n"
+ "fmla v11.4s, v16.4s, v0.4s\n"
+ "ldr s18, [%[inptr0], x23]\n"
+ "fmla v5.4s, v15.4s, v6.4s\n"
+ "ldr s17, [x27]\n"
+ "ldr s13, [x28, %[input_col_stride1]]\n"
+ "fmla v5.4s, v21.4s, v1.4s\n"
+ "fmla v5.4s, v16.4s, v4.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v5.4s, v23.4s, v3.4s\n"
+ "ldr s21, [x21, x23]\n"
+ "fmla v12.4s, v18.4s, v0.4s\n"
+ "ldr s20, [%[inptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v6.4s\n"
+ "ldr s19, [x22]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr s15, [x27, %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v6.4s\n"
+ "ldr s16, [x28, x23]\n"
+ "fmla v11.4s, v13.4s, v1.4s\n"
+ "ldr s17, [x21, x26]\n"
+ "fmla v5.4s, v13.4s, v8.4s\n"
+ "ldr s14, [%[inptr0], x25]\n"
+ "fmla v12.4s, v20.4s, v1.4s\n"
+ "ldr s20, [x22, %[input_col_stride1]]\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "fmla v5.4s, v21.4s, v7.4s\n"
+ "ldr s22, [x28, x26]\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "ldr s21, [x21, x25]\n"
+ "fmla v11.4s, v15.4s, v3.4s\n"
+ "ldr s23, [x22, x23]\n"
+ "fmla v5.4s, v16.4s, v9.4s\n"
+ "ldr s18, [x27, x26]\n"
+ "fmla v10.4s, v16.4s, v0.4s\n"
+ "ldr s15, [x28, x25]\n"
+ "fmla v11.4s, v16.4s, v2.4s\n"
+ "ldr s16, [x22, x26]\n"
+ "fmla v12.4s, v17.4s, v3.4s\n"
+ "ldr s17, [x27, x25]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr s13, [x22, x25]\n"
+ "fmla v11.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v12.4s, v14.4s, v2.4s\n"
+ "ldr s14, [%[wbptr]]\n"
+ "fmla v10.4s, v22.4s, v1.4s\n"
+ "ldr s0, [%[wbptr], #4]\n"
+ "fmla v11.4s, v19.4s, v7.4s\n"
+ "ldr s6, [%[wbptr], #16]\n"
+ "fmla v12.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v10.4s, v23.4s, v4.4s\n"
+ "ldr s1, [%[wbptr], #8]\n"
+ "fmla v11.4s, v23.4s, v9.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v12.4s, v21.4s, v7.4s\n"
+ "ldr s19, [%[inptr0]]\n"
+ "fmla v10.4s, v18.4s, v3.4s\n"
+ "ldr s4, [%[wbptr], #28]\n"
+ "movi v20.16b, #0\n"
+ "ldr s21, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v12.4s, v15.4s, v9.4s\n"
+ "ldr s18, [%[inptr0], x23]\n"
+ "fmla v10.4s, v15.4s, v2.4s\n"
+ "ldr s3, [%[wbptr], #20]\n"
+ "fmov v22.4s, #6.0\n"
+ "add x21, x21, #4\n"
+ "fmax v5.4s, v5.4s, v20.4s\n"
+ "ldr s15, [x21]\n"
+ "fmla v10.4s, v16.4s, v8.4s\n"
+ "ldr s2, [%[wbptr], #12]\n"
+ "fmin v5.4s, v5.4s, v22.4s\n"
+ "ldr s23, [x21, %[input_col_stride1]]\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "add x28, x28, #4\n"
+ "str s5, [%[outptr0]]\n"
+ "fmla v10.4s, v17.4s, v7.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "ldr s16, [x28]\n"
+ "str s12, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v13.4s, v9.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "mov v5.16b, v14.16b\n"
+ "ldr s13, [x28, %[input_col_stride1]]\n"
+ "str s11, [x24]\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "mov v11.16b, v14.16b\n"
+ "ldr s9, [%[wbptr], #36]\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "add x27, x27, #4\n"
+ "mov v12.16b, v14.16b\n"
+ "ldr s17, [x27]\n"
+ "str s10, [x24, %[output_col_stride1]]\n"
+ "fmla v5.4s, v19.4s, v0.4s\n"
+ "mov v10.16b, v14.16b\n"
+ "add x22, x22, #4\n"
+ "fmla v11.4s, v16.4s, v0.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v5.4s, v15.4s, v6.4s\n"
+ "add x24, x24, #4\n"
+ "subs x19, x19, #1\n"
+ "fmla v5.4s, v21.4s, v1.4s\n"
+ "fmla v5.4s, v16.4s, v4.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v5.4s, v23.4s, v3.4s\n"
+ "ldr s21, [x21, x23]\n"
+ "fmla v12.4s, v18.4s, v0.4s\n"
+ "ldr s20, [%[inptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v6.4s\n"
+ "ldr s19, [x22]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr s15, [x27, %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v6.4s\n"
+ "ldr s16, [x28, x23]\n"
+ "fmla v11.4s, v13.4s, v1.4s\n"
+ "ldr s17, [x21, x26]\n"
+ "fmla v5.4s, v13.4s, v8.4s\n"
+ "ldr s14, [%[inptr0], x25]\n"
+ "fmla v12.4s, v20.4s, v1.4s\n"
+ "ldr s20, [x22, %[input_col_stride1]]\n"
+ "fmla v11.4s, v19.4s, v4.4s\n"
+ "ldr s19, [x27, x23]\n"
+ "fmla v5.4s, v21.4s, v7.4s\n"
+ "ldr s22, [x28, x26]\n"
+ "fmla v12.4s, v16.4s, v4.4s\n"
+ "ldr s21, [x21, x25]\n"
+ "fmla v11.4s, v15.4s, v3.4s\n"
+ "ldr s23, [x22, x23]\n"
+ "fmla v5.4s, v16.4s, v9.4s\n"
+ "ldr s18, [x27, x26]\n"
+ "fmla v10.4s, v16.4s, v0.4s\n"
+ "ldr s15, [x28, x25]\n"
+ "fmla v11.4s, v16.4s, v2.4s\n"
+ "ldr s16, [x22, x26]\n"
+ "fmla v12.4s, v17.4s, v3.4s\n"
+ "ldr s17, [x27, x25]\n"
+ "fmla v10.4s, v19.4s, v6.4s\n"
+ "ldr s13, [x22, x25]\n"
+ "fmla v11.4s, v20.4s, v8.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v12.4s, v14.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v10.4s, v22.4s, v1.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v11.4s, v19.4s, v7.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v12.4s, v22.4s, v8.4s\n"
+ "add x28, x28, #4\n"
+ "fmla v10.4s, v23.4s, v4.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v11.4s, v23.4s, v9.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v12.4s, v21.4s, v7.4s\n"
+ "movi v20.16b, #0\n"
+ "fmla v10.4s, v18.4s, v3.4s\n"
+ "fmov v22.4s, #6.0\n"
+ "fmax v5.4s, v5.4s, v20.4s\n"
+ "fmax v11.4s, v11.4s, v20.4s\n"
+ "fmla v12.4s, v15.4s, v9.4s\n"
+ "fmla v10.4s, v15.4s, v2.4s\n"
+ "fmin v5.4s, v5.4s, v22.4s\n"
+ "fmin v11.4s, v11.4s, v22.4s\n"
+ "fmax v12.4s, v12.4s, v20.4s\n"
+ "str s5, [%[outptr0]]\n"
+ "str s11, [x24]\n"
+ "fmla v10.4s, v16.4s, v8.4s\n"
+ "fmin v12.4s, v12.4s, v22.4s\n"
+ "str s12, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v17.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v10.4s, v13.4s, v9.4s\n"
+ "fmax v10.4s, v10.4s, v20.4s\n"
+ "fmin v10.4s, v10.4s, v22.4s\n"
+ "str s10, [x24, %[output_col_stride1]]\n"
+ "add x24, x24, #4\n"
+ "7:\n"
+ : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+ : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- },
-};
-
template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x26, %[n_channels], #3\n"
+ "lsr x25, %[n_channels], #2\n"
+ "cbz x25, 4f\n"
+ "1:\n"
+ "ldr q15, [%[wbptr]]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr q10, [%[wbptr], #32]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q7, [%[wbptr], #48]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "ldr q5, [%[wbptr], #80]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "ldr q0, [%[wbptr], #96]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "subs x25, x25, #1\n"
+ "ldr q1, [%[wbptr], #144]\n"
+ "ldr q17, [x21, x27]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "ldr q17, [x23, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "ldr q11, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr q19, [x22, x27]\n"
+ "ldr q15, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "ldr q12, [x21, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr q16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr q18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr q19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr q17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr q13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr q10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr q9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "str q8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr q16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "ldr q15, [%[wbptr]]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "ldr q14, [%[wbptr], #16]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr q10, [%[wbptr], #32]\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "ldr q13, [%[wbptr], #64]\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "ldr q7, [%[wbptr], #48]\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "ldr q5, [%[wbptr], #80]\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x27, x27, #16\n"
+ "str q3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str q2, [x22, x28]\n"
+ "mov v3.16b, v15.16b\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q6, [%[wbptr], #128]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "ldr q0, [%[wbptr], #96]\n"
+ "ldr q17, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr q1, [%[wbptr], #144]\n"
+ "ldr q11, [x21, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "ldr q19, [x22, x27]\n"
+ "ldr q12, [x21, x27]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "subs x25, x25, #1\n"
+ "str q4, [x24, x28]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "add x28, x28, #16\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr q16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr q18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr q19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr q17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "ldr q16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr q14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr q17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr q18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr q13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr q10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr q15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr q9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr q19, [x23, x27]\n"
+ "str q8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr q16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "add x27, x27, #16\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "str q3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str q2, [x22, x28]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "str q4, [x24, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x26, 7f\n"
+ "ldr s15, [%[wbptr]]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "mov v3.16b, v15.16b\n"
+ "ldr s10, [%[wbptr], #8]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s7, [%[wbptr], #12]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "ldr s5, [%[wbptr], #20]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr s0, [%[wbptr], #24]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr s1, [%[wbptr], #36]\n"
+ "subs x26, x26, #1\n"
+ "ldr s17, [x21, x27]\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr s17, [x23, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "ldr s11, [x21, x27]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr s19, [x22, x27]\n"
+ "ldr s15, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "ldr s12, [x21, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr s16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr s18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr s19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr s17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr s13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr s10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr s9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "str s8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "ldr s15, [%[wbptr]]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "ldr s14, [%[wbptr], #4]\n"
+ "mov v8.16b, v15.16b\n"
+ "ldr s10, [%[wbptr], #8]\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "ldr s13, [%[wbptr], #16]\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "ldr s7, [%[wbptr], #12]\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "ldr s5, [%[wbptr], #20]\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "add x27, x27, #4\n"
+ "str s3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str s2, [x22, x28]\n"
+ "mov v3.16b, v15.16b\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s6, [%[wbptr], #32]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "ldr x21, [%[inptrs], 0]\n"
+ "ldr x22, [%[inptrs], 40]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "ldr s0, [%[wbptr], #24]\n"
+ "ldr s17, [x21, x27]\n"
+ "ldr x20, [%[inptrs], 80]\n"
+ "fmla v8.4s, v17.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x21, [%[inptrs], 8]\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "ldr s1, [%[wbptr], #36]\n"
+ "ldr s11, [x21, x27]\n"
+ "ldr x22, [%[inptrs], 48]\n"
+ "fmla v8.4s, v18.4s, v13.4s\n"
+ "ldr x21, [%[inptrs], 16]\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "ldr s19, [x22, x27]\n"
+ "ldr s12, [x21, x27]\n"
+ "ldr x23, [%[inptrs], 120]\n"
+ "ldr x20, [%[inptrs], 88]\n"
+ "subs x26, x26, #1\n"
+ "str s4, [x24, x28]\n"
+ "mov v4.16b, v15.16b\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v8.4s, v11.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "add x28, x28, #4\n"
+ "fmla v8.4s, v16.4s, v9.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v3.4s, v16.4s, v14.4s\n"
+ "ldr x22, [%[inptrs], 56]\n"
+ "fmla v8.4s, v19.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 24]\n"
+ "fmla v2.4s, v12.4s, v14.4s\n"
+ "ldr s16, [x22, x27]\n"
+ "movi v11.16b, #0\n"
+ "ldr s18, [x21, x27]\n"
+ "fmla v3.4s, v17.4s, v13.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v8.4s, v12.4s, v7.4s\n"
+ "ldr x23, [%[inptrs], 128]\n"
+ "fmla v2.4s, v16.4s, v13.4s\n"
+ "ldr s19, [x20, x27]\n"
+ "fmov v12.4s, #6.0\n"
+ "ldr s17, [x23, x27]\n"
+ "fmla v3.4s, v15.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 96]\n"
+ "fmla v8.4s, v15.4s, v6.4s\n"
+ "ldr x22, [%[inptrs], 64]\n"
+ "fmla v2.4s, v18.4s, v10.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v14.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v3.4s, v19.4s, v9.4s\n"
+ "ldr x21, [%[inptrs], 32]\n"
+ "fmla v8.4s, v16.4s, v0.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v2.4s, v15.4s, v9.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "ldr s16, [x20, x27]\n"
+ "ldr x23, [%[inptrs], 136]\n"
+ "fmla v3.4s, v17.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 104]\n"
+ "fmla v8.4s, v15.4s, v1.4s\n"
+ "ldr s14, [x23, x27]\n"
+ "fmla v2.4s, v18.4s, v5.4s\n"
+ "ldr s17, [x20, x27]\n"
+ "fmla v4.4s, v14.4s, v13.4s\n"
+ "ldr x22, [%[inptrs], 72]\n"
+ "fmla v3.4s, v15.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmax v8.4s, v8.4s, v11.4s\n"
+ "ldr s18, [x22, x27]\n"
+ "fmla v2.4s, v19.4s, v7.4s\n"
+ "ldr s13, [x20, x27]\n"
+ "fmla v4.4s, v17.4s, v10.4s\n"
+ "ldr x23, [%[inptrs], 144]\n"
+ "fmla v3.4s, v16.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 112]\n"
+ "fmin v8.4s, v8.4s, v12.4s\n"
+ "ldr s10, [x23, x27]\n"
+ "fmla v2.4s, v17.4s, v6.4s\n"
+ "ldr s15, [x20, x27]\n"
+ "fmla v4.4s, v13.4s, v9.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v3.4s, v14.4s, v0.4s\n"
+ "ldr x23, [%[inptrs], 152]\n"
+ "ldr s9, [x20, x27]\n"
+ "ldr x22, [%[outptrs], 0]\n"
+ "fmla v2.4s, v18.4s, v0.4s\n"
+ "ldr s19, [x23, x27]\n"
+ "str s8, [x22, x28]\n"
+ "fmla v4.4s, v10.4s, v5.4s\n"
+ "fmla v3.4s, v13.4s, v1.4s\n"
+ "ldr x20, [%[inptrs], 192]\n"
+ "ldr x22, [%[outptrs], 8]\n"
+ "ldr x24, [%[outptrs], 16]\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v15.4s, v1.4s\n"
+ "ldr s16, [x20, x27]\n"
+ "fmla v4.4s, v15.4s, v7.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmax v3.4s, v3.4s, v11.4s\n"
+ "add x27, x27, #4\n"
+ "fmax v2.4s, v2.4s, v11.4s\n"
+ "fmla v4.4s, v9.4s, v6.4s\n"
+ "fmin v3.4s, v3.4s, v12.4s\n"
+ "fmin v2.4s, v2.4s, v12.4s\n"
+ "str s3, [x24, x28]\n"
+ "fmla v4.4s, v19.4s, v0.4s\n"
+ "str s2, [x22, x28]\n"
+ "ldr x24, [%[outptrs], 24]\n"
+ "fmla v4.4s, v16.4s, v1.4s\n"
+ "fmax v4.4s, v4.4s, v11.4s\n"
+ "fmin v4.4s, v4.4s, v12.4s\n"
+ "str s4, [x24, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
-template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
+#endif // __aarch64__
+
+template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp
new file mode 100644
index 0000000..23a99a8
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp16_fp16.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 21e8f04..2508ec7 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,931 +25,2317 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
#ifdef __aarch64__
-
template <>
template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
- const int n_channels,
- const float* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
)
{
- // Copy pointers
- const float *uptr0 = inptr;
- const float *wptr0 = weights;
- float *vptr0 = outptr;
+ __asm __volatile(
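+    // x20-x23 walk the input rows (inptr0 plus multiples of input_row_stride) and x24/x25 the
+    // output rows; x13/x15/x17 are precomputed column offsets, with x14/x16/x18 used for prefetch.
+    // x28 = n_channels / 4 drives the vectorised loop and x27 = n_channels % 4 the scalar tail.
+    // The nine accumulators for the 3x3 output tile start from the bias in v25 and are stored
+    // directly, with no activation clamp in this variant.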
+ "add x20, %[inptr0], %[input_row_stride]\n"
+ "add x13, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x24, %[outptr0], %[output_row_stride]\n"
+ "add x21, x20, %[input_row_stride]\n"
+ "add x14, x13, #64\n"
+ "add x15, x13, %[input_col_stride1]\n"
+ "add x22, x21, %[input_row_stride]\n"
+ "add x16, x15, #64\n"
+ "add x17, x15, %[input_col_stride1]\n"
+ "add x23, x22, %[input_row_stride]\n"
+ "add x18, x17, #64\n"
+ "add x25, x24, %[output_row_stride]\n"
+ "add x26, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x27, %[n_channels], #3\n"
+ "lsr x28, %[n_channels], #2\n"
+ "cbz x28, 4f\n"
+ "1:\n"
+ "ldr q25, [%[wbptr]]\n"
+ "subs x28, x28, #1\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q16, [%[wbptr], #16]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr q7, [%[wbptr], #32]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr q6, [%[wbptr], #48]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "mov v9.16b, v25.16b\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "mov v8.16b, v25.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "ldr q26, [%[inptr0]]\n"
+ "ldr q28, [x20]\n"
+ "fmla v17.4s, v26.4s, v16.4s\n"
+ "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v13.4s, v28.4s, v16.4s\n"
+ "ldr q27, [x21]\n"
+ "fmla v15.4s, v29.4s, v16.4s\n"
+ "ldr q21, [x20, %[input_col_stride1]]\n"
+ "fmla v17.4s, v28.4s, v5.4s\n"
+ "ldr q20, [%[inptr0], x13]\n"
+ "ldr q23, [x22]\n"
+ "ldr q19, [x21, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "fmla v17.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [%[inptr0], x19]\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "prfm pldl1keep, [x20, x19]\n"
+ "prfm pldl1keep, [%[inptr0], x14]\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "prfm pldl1keep, [x21, x19]\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "ldr q30, [x20, x13]\n"
+ "fmla v13.4s, v27.4s, v5.4s\n"
+ "ldr q29, [%[inptr0], x15]\n"
+ "fmla v10.4s, v27.4s, v16.4s\n"
+ "ldr q28, [x23]\n"
+ "fmla v17.4s, v21.4s, v4.4s\n"
+ "ldr q24, [x22, %[input_col_stride1]]\n"
+ "fmla v13.4s, v21.4s, v7.4s\n"
+ "ldr q18, [x21, x13]\n"
+ "fmla v15.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla v12.4s, v21.4s, v16.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v17.4s, v20.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v15.4s, v20.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v14.4s, v20.4s, v16.4s\n"
+ "ldr q25, [%[inptr0], x17]\n"
+ "fmla v13.4s, v23.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x19]\n"
+ "fmla v10.4s, v23.4s, v5.4s\n"
+ "ldr q26, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v19.4s, v1.4s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla v13.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v15.4s, v19.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v10.4s, v19.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, x19]\n"
+ "fmla v12.4s, v19.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla v9.4s, v19.4s, v16.4s\n"
+ "ldr q27, [x22, x13]\n"
+ "fmla v17.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v13.4s, v30.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v15.4s, v30.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla v12.4s, v30.4s, v7.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x21, x18]\n"
+ "fmla v11.4s, v30.4s, v16.4s\n"
+ "ldr q21, [x21, x15]\n"
+ "fmla v15.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr q20, [x20, x17]\n"
+ "fmla v10.4s, v28.4s, v2.4s\n"
+ "ldr q19, [x23, x13]\n"
+ "fmla v13.4s, v24.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v12.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v10.4s, v24.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v9.4s, v24.4s, v5.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v17.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v13.4s, v18.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v15.4s, v18.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "str q17, [%[outptr0]]\n"
+ "fmla v10.4s, v18.4s, v6.4s\n"
+ "fmla v12.4s, v18.4s, v4.4s\n"
+ "ldr q17, [x21, x17]\n"
+ "fmla v14.4s, v18.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x19]\n"
+ "fmla v9.4s, v18.4s, v7.4s\n"
+ "prfm pldl1keep, [%[inptr0], x14]\n"
+ "fmla v11.4s, v18.4s, v5.4s\n"
+ "add x20, x20, #16\n"
+ "fmla v8.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x23, x15]\n"
+ "fmla v15.4s, v22.4s, v3.4s\n"
+ "ldr q18, [x22, x17]\n"
+ "fmla v12.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "fmla v14.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x19]\n"
+ "fmla v11.4s, v22.4s, v7.4s\n"
+ "ldr q22, [x23, x17]\n"
+ "fmla v10.4s, v26.4s, v1.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v14.4s, v25.4s, v6.4s\n"
+ "ldr q25, [%[wbptr]]\n"
+ "fmla v9.4s, v26.4s, v2.4s\n"
+ "ldr q16, [%[wbptr], #16]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "fmla v10.4s, v27.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x19]\n"
+ "fmla v12.4s, v27.4s, v1.4s\n"
+ "add x22, x22, #16\n"
+ "str q13, [x24]\n"
+ "fmla v9.4s, v27.4s, v4.4s\n"
+ "fmla v11.4s, v27.4s, v2.4s\n"
+ "ldr q26, [%[inptr0]]\n"
+ "fmla v8.4s, v27.4s, v5.4s\n"
+ "ldr q28, [x20]\n"
+ "fmla v15.4s, v21.4s, v0.4s\n"
+ "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v14.4s, v21.4s, v1.4s\n"
+ "add x23, x23, #16\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "ldr q5, [%[wbptr], #64]\n"
+ "fmla v8.4s, v21.4s, v7.4s\n"
+ "ldr q27, [x21]\n"
+ "fmla v14.4s, v20.4s, v3.4s\n"
+ "ldr q21, [x20, %[input_col_stride1]]\n"
+ "fmla v11.4s, v20.4s, v6.4s\n"
+ "ldr q20, [%[inptr0], x13]\n"
+ "fmla v10.4s, v19.4s, v0.4s\n"
+ "subs x28, x28, #1\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v8.4s, v19.4s, v2.4s\n"
+ "fmla v12.4s, v23.4s, v0.4s\n"
+ "ldr q7, [%[wbptr], #32]\n"
+ "str q10, [x25]\n"
+ "fmla v11.4s, v23.4s, v1.4s\n"
+ "fmla v9.4s, v23.4s, v3.4s\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "str q12, [x24, %[output_col_stride1]]\n"
+ "fmla v8.4s, v23.4s, v4.4s\n"
+ "fmla v14.4s, v17.4s, v0.4s\n"
+ "ldr q23, [x22]\n"
+ "fmla v11.4s, v17.4s, v3.4s\n"
+ "ldr q19, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v17.4s, v6.4s\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "str q14, [%[outptr0], x26]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "fmla v11.4s, v18.4s, v0.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v8.4s, v24.4s, v1.4s\n"
+ "ldr q6, [%[wbptr], #48]\n"
+ "str q9, [x25, %[output_col_stride1]]\n"
+ "mov v17.16b, v25.16b\n"
+ "str q11, [x24, x26]\n"
+ "mov v13.16b, v25.16b\n"
+ "fmla v8.4s, v18.4s, v3.4s\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "mov v15.16b, v25.16b\n"
+ "add x24, x24, #16\n"
+ "mov v10.16b, v25.16b\n"
+ "mov v12.16b, v25.16b\n"
+ "fmla v8.4s, v22.4s, v0.4s\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "mov v14.16b, v25.16b\n"
+ "mov v9.16b, v25.16b\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v17.4s, v26.4s, v16.4s\n"
+ "str q8, [x25, x26]\n"
+ "fmla v13.4s, v28.4s, v16.4s\n"
+ "mov v8.16b, v25.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "fmla v17.4s, v28.4s, v5.4s\n"
+ "fmla v15.4s, v29.4s, v16.4s\n"
+ "add x25, x25, #16\n"
+ "fmla v17.4s, v29.4s, v7.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "ldr q30, [x20, x13]\n"
+ "fmla v13.4s, v27.4s, v5.4s\n"
+ "ldr q29, [%[inptr0], x15]\n"
+ "fmla v10.4s, v27.4s, v16.4s\n"
+ "ldr q28, [x23]\n"
+ "fmla v17.4s, v21.4s, v4.4s\n"
+ "ldr q24, [x22, %[input_col_stride1]]\n"
+ "fmla v13.4s, v21.4s, v7.4s\n"
+ "ldr q18, [x21, x13]\n"
+ "fmla v15.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla v12.4s, v21.4s, v16.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v17.4s, v20.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v15.4s, v20.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v14.4s, v20.4s, v16.4s\n"
+ "ldr q25, [%[inptr0], x17]\n"
+ "fmla v13.4s, v23.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x19]\n"
+ "fmla v10.4s, v23.4s, v5.4s\n"
+ "ldr q26, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v19.4s, v1.4s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla v13.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v15.4s, v19.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v10.4s, v19.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, x19]\n"
+ "fmla v12.4s, v19.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla v9.4s, v19.4s, v16.4s\n"
+ "ldr q27, [x22, x13]\n"
+ "fmla v17.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v13.4s, v30.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v15.4s, v30.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla v12.4s, v30.4s, v7.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x21, x18]\n"
+ "fmla v11.4s, v30.4s, v16.4s\n"
+ "ldr q21, [x21, x15]\n"
+ "fmla v15.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr q20, [x20, x17]\n"
+ "fmla v10.4s, v28.4s, v2.4s\n"
+ "ldr q19, [x23, x13]\n"
+ "fmla v13.4s, v24.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v12.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v10.4s, v24.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v9.4s, v24.4s, v5.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v17.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v13.4s, v18.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v15.4s, v18.4s, v1.4s\n"
+ "add x20, x20, #16\n"
+ "str q17, [%[outptr0]]\n"
+ "fmla v10.4s, v18.4s, v6.4s\n"
+ "fmla v12.4s, v18.4s, v4.4s\n"
+ "ldr q17, [x21, x17]\n"
+ "fmla v14.4s, v18.4s, v2.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v9.4s, v18.4s, v7.4s\n"
+ "fmla v11.4s, v18.4s, v5.4s\n"
+ "fmla v8.4s, v18.4s, v16.4s\n"
+ "ldr q24, [x23, x15]\n"
+ "fmla v15.4s, v22.4s, v3.4s\n"
+ "ldr q18, [x22, x17]\n"
+ "fmla v12.4s, v22.4s, v6.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v14.4s, v22.4s, v4.4s\n"
+ "fmla v11.4s, v22.4s, v7.4s\n"
+ "fmla v10.4s, v26.4s, v1.4s\n"
+ "ldr q22, [x23, x17]\n"
+ "fmla v9.4s, v26.4s, v2.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v14.4s, v25.4s, v6.4s\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "fmla v10.4s, v27.4s, v3.4s\n"
+ "fmla v12.4s, v27.4s, v1.4s\n"
+ "fmla v9.4s, v27.4s, v4.4s\n"
+ "fmla v11.4s, v27.4s, v2.4s\n"
+ "str q13, [x24]\n"
+ "fmla v8.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v21.4s, v0.4s\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "fmla v14.4s, v21.4s, v1.4s\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "fmla v8.4s, v21.4s, v7.4s\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v19.4s, v0.4s\n"
+ "fmla v14.4s, v20.4s, v3.4s\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v11.4s, v20.4s, v6.4s\n"
+ "fmla v8.4s, v19.4s, v2.4s\n"
+ "str q10, [x25]\n"
+ "fmla v12.4s, v23.4s, v0.4s\n"
+ "fmla v9.4s, v23.4s, v3.4s\n"
+ "fmla v14.4s, v17.4s, v0.4s\n"
+ "fmla v11.4s, v23.4s, v1.4s\n"
+ "fmla v8.4s, v23.4s, v4.4s\n"
+ "str q12, [x24, %[output_col_stride1]]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "str q14, [%[outptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v3.4s\n"
+ "fmla v8.4s, v17.4s, v6.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q9, [x25, %[output_col_stride1]]\n"
+ "fmla v11.4s, v18.4s, v0.4s\n"
+ "fmla v8.4s, v24.4s, v1.4s\n"
+ "str q11, [x24, x26]\n"
+ "fmla v8.4s, v18.4s, v3.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v8.4s, v22.4s, v0.4s\n"
+ "str q8, [x25, x26]\n"
+ "add x25, x25, #16\n"
+ "4:\n"
+ "cbz x27, 7f\n"
+ "ldr s25, [%[wbptr]]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr s16, [%[wbptr], #4]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr s7, [%[wbptr], #8]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr s6, [%[wbptr], #12]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "mov v9.16b, v25.16b\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "mov v8.16b, v25.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "ldr s26, [%[inptr0]]\n"
+ "subs x27, x27, #1\n"
+ "fmla v17.4s, v26.4s, v16.4s\n"
+ "ldr s28, [x20]\n"
+ "fmla v13.4s, v28.4s, v16.4s\n"
+ "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v15.4s, v29.4s, v16.4s\n"
+ "ldr s27, [x21]\n"
+ "fmla v17.4s, v28.4s, v5.4s\n"
+ "ldr s21, [x20, %[input_col_stride1]]\n"
+ "ldr s20, [%[inptr0], x13]\n"
+ "ldr s23, [x22]\n"
+ "ldr s19, [x21, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v17.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x19]\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "prfm pldl1keep, [x20, x19]\n"
+ "prfm pldl1keep, [%[inptr0], x14]\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "prfm pldl1keep, [x21, x19]\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "ldr s30, [x20, x13]\n"
+ "fmla v13.4s, v27.4s, v5.4s\n"
+ "ldr s29, [%[inptr0], x15]\n"
+ "fmla v10.4s, v27.4s, v16.4s\n"
+ "ldr s28, [x23]\n"
+ "fmla v17.4s, v21.4s, v4.4s\n"
+ "ldr s24, [x22, %[input_col_stride1]]\n"
+ "fmla v13.4s, v21.4s, v7.4s\n"
+ "ldr s18, [x21, x13]\n"
+ "fmla v15.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla v12.4s, v21.4s, v16.4s\n"
+ "ldr s22, [x20, x15]\n"
+ "fmla v17.4s, v20.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v15.4s, v20.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v14.4s, v20.4s, v16.4s\n"
+ "ldr s25, [%[inptr0], x17]\n"
+ "fmla v13.4s, v23.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x19]\n"
+ "fmla v10.4s, v23.4s, v5.4s\n"
+ "ldr s26, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v19.4s, v1.4s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla v13.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v15.4s, v19.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v10.4s, v19.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, x19]\n"
+ "fmla v12.4s, v19.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla v9.4s, v19.4s, v16.4s\n"
+ "ldr s27, [x22, x13]\n"
+ "fmla v17.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v13.4s, v30.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v15.4s, v30.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla v12.4s, v30.4s, v7.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x21, x18]\n"
+ "fmla v11.4s, v30.4s, v16.4s\n"
+ "ldr s21, [x21, x15]\n"
+ "fmla v15.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr s20, [x20, x17]\n"
+ "fmla v10.4s, v28.4s, v2.4s\n"
+ "ldr s19, [x23, x13]\n"
+ "fmla v13.4s, v24.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v12.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v10.4s, v24.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v9.4s, v24.4s, v5.4s\n"
+ "ldr s23, [x22, x15]\n"
+ "fmla v17.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v13.4s, v18.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v15.4s, v18.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "str s17, [%[outptr0]]\n"
+ "fmla v10.4s, v18.4s, v6.4s\n"
+ "fmla v12.4s, v18.4s, v4.4s\n"
+ "ldr s17, [x21, x17]\n"
+ "fmla v14.4s, v18.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x19]\n"
+ "fmla v9.4s, v18.4s, v7.4s\n"
+ "prfm pldl1keep, [%[inptr0], x14]\n"
+ "fmla v11.4s, v18.4s, v5.4s\n"
+ "add x20, x20, #4\n"
+ "fmla v8.4s, v18.4s, v16.4s\n"
+ "ldr s24, [x23, x15]\n"
+ "fmla v15.4s, v22.4s, v3.4s\n"
+ "ldr s18, [x22, x17]\n"
+ "fmla v12.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "fmla v14.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x19]\n"
+ "fmla v11.4s, v22.4s, v7.4s\n"
+ "ldr s22, [x23, x17]\n"
+ "fmla v10.4s, v26.4s, v1.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v14.4s, v25.4s, v6.4s\n"
+ "ldr s25, [%[wbptr]]\n"
+ "fmla v9.4s, v26.4s, v2.4s\n"
+ "ldr s16, [%[wbptr], #4]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "fmla v10.4s, v27.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x19]\n"
+ "fmla v12.4s, v27.4s, v1.4s\n"
+ "add x22, x22, #4\n"
+ "str s13, [x24]\n"
+ "fmla v9.4s, v27.4s, v4.4s\n"
+ "fmla v11.4s, v27.4s, v2.4s\n"
+ "ldr s26, [%[inptr0]]\n"
+ "fmla v8.4s, v27.4s, v5.4s\n"
+ "ldr s28, [x20]\n"
+ "fmla v15.4s, v21.4s, v0.4s\n"
+ "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v14.4s, v21.4s, v1.4s\n"
+ "add x23, x23, #4\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "ldr s5, [%[wbptr], #16]\n"
+ "fmla v8.4s, v21.4s, v7.4s\n"
+ "ldr s27, [x21]\n"
+ "fmla v14.4s, v20.4s, v3.4s\n"
+ "ldr s21, [x20, %[input_col_stride1]]\n"
+ "fmla v11.4s, v20.4s, v6.4s\n"
+ "ldr s20, [%[inptr0], x13]\n"
+ "fmla v10.4s, v19.4s, v0.4s\n"
+ "subs x27, x27, #1\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v8.4s, v19.4s, v2.4s\n"
+ "fmla v12.4s, v23.4s, v0.4s\n"
+ "ldr s7, [%[wbptr], #8]\n"
+ "str s10, [x25]\n"
+ "fmla v11.4s, v23.4s, v1.4s\n"
+ "fmla v9.4s, v23.4s, v3.4s\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "str s12, [x24, %[output_col_stride1]]\n"
+ "fmla v8.4s, v23.4s, v4.4s\n"
+ "fmla v14.4s, v17.4s, v0.4s\n"
+ "ldr s23, [x22]\n"
+ "fmla v11.4s, v17.4s, v3.4s\n"
+ "ldr s19, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v17.4s, v6.4s\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "str s14, [%[outptr0], x26]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "fmla v11.4s, v18.4s, v0.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v8.4s, v24.4s, v1.4s\n"
+ "ldr s6, [%[wbptr], #12]\n"
+ "str s9, [x25, %[output_col_stride1]]\n"
+ "mov v17.16b, v25.16b\n"
+ "str s11, [x24, x26]\n"
+ "mov v13.16b, v25.16b\n"
+ "fmla v8.4s, v18.4s, v3.4s\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "mov v15.16b, v25.16b\n"
+ "add x24, x24, #4\n"
+ "mov v10.16b, v25.16b\n"
+ "mov v12.16b, v25.16b\n"
+ "fmla v8.4s, v22.4s, v0.4s\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "mov v14.16b, v25.16b\n"
+ "mov v9.16b, v25.16b\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v17.4s, v26.4s, v16.4s\n"
+ "str s8, [x25, x26]\n"
+ "fmla v13.4s, v28.4s, v16.4s\n"
+ "mov v8.16b, v25.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "fmla v17.4s, v28.4s, v5.4s\n"
+ "fmla v15.4s, v29.4s, v16.4s\n"
+ "add x25, x25, #4\n"
+ "fmla v17.4s, v29.4s, v7.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "ldr s30, [x20, x13]\n"
+ "fmla v13.4s, v27.4s, v5.4s\n"
+ "ldr s29, [%[inptr0], x15]\n"
+ "fmla v10.4s, v27.4s, v16.4s\n"
+ "ldr s28, [x23]\n"
+ "fmla v17.4s, v21.4s, v4.4s\n"
+ "ldr s24, [x22, %[input_col_stride1]]\n"
+ "fmla v13.4s, v21.4s, v7.4s\n"
+ "ldr s18, [x21, x13]\n"
+ "fmla v15.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [x20, x14]\n"
+ "fmla v12.4s, v21.4s, v16.4s\n"
+ "ldr s22, [x20, x15]\n"
+ "fmla v17.4s, v20.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v15.4s, v20.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v14.4s, v20.4s, v16.4s\n"
+ "ldr s25, [%[inptr0], x17]\n"
+ "fmla v13.4s, v23.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x19]\n"
+ "fmla v10.4s, v23.4s, v5.4s\n"
+ "ldr s26, [x23, %[input_col_stride1]]\n"
+ "fmla v17.4s, v19.4s, v1.4s\n"
+ "prfm pldl1keep, [x21, x14]\n"
+ "fmla v13.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v15.4s, v19.4s, v2.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v10.4s, v19.4s, v7.4s\n"
+ "prfm pldl1keep, [x23, x19]\n"
+ "fmla v12.4s, v19.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, x14]\n"
+ "fmla v9.4s, v19.4s, v16.4s\n"
+ "ldr s27, [x22, x13]\n"
+ "fmla v17.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v13.4s, v30.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v15.4s, v30.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x14]\n"
+ "fmla v12.4s, v30.4s, v7.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x21, x18]\n"
+ "fmla v11.4s, v30.4s, v16.4s\n"
+ "ldr s21, [x21, x15]\n"
+ "fmla v15.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr s20, [x20, x17]\n"
+ "fmla v10.4s, v28.4s, v2.4s\n"
+ "ldr s19, [x23, x13]\n"
+ "fmla v13.4s, v24.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v12.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v10.4s, v24.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v9.4s, v24.4s, v5.4s\n"
+ "ldr s23, [x22, x15]\n"
+ "fmla v17.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v13.4s, v18.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v15.4s, v18.4s, v1.4s\n"
+ "add x20, x20, #4\n"
+ "str s17, [%[outptr0]]\n"
+ "fmla v10.4s, v18.4s, v6.4s\n"
+ "fmla v12.4s, v18.4s, v4.4s\n"
+ "ldr s17, [x21, x17]\n"
+ "fmla v14.4s, v18.4s, v2.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v9.4s, v18.4s, v7.4s\n"
+ "fmla v11.4s, v18.4s, v5.4s\n"
+ "fmla v8.4s, v18.4s, v16.4s\n"
+ "ldr s24, [x23, x15]\n"
+ "fmla v15.4s, v22.4s, v3.4s\n"
+ "ldr s18, [x22, x17]\n"
+ "fmla v12.4s, v22.4s, v6.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v14.4s, v22.4s, v4.4s\n"
+ "fmla v11.4s, v22.4s, v7.4s\n"
+ "fmla v10.4s, v26.4s, v1.4s\n"
+ "ldr s22, [x23, x17]\n"
+ "fmla v9.4s, v26.4s, v2.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v14.4s, v25.4s, v6.4s\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "fmla v10.4s, v27.4s, v3.4s\n"
+ "fmla v12.4s, v27.4s, v1.4s\n"
+ "fmla v9.4s, v27.4s, v4.4s\n"
+ "fmla v11.4s, v27.4s, v2.4s\n"
+ "str s13, [x24]\n"
+ "fmla v8.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v21.4s, v0.4s\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "fmla v14.4s, v21.4s, v1.4s\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "fmla v8.4s, v21.4s, v7.4s\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v10.4s, v19.4s, v0.4s\n"
+ "fmla v14.4s, v20.4s, v3.4s\n"
+ "fmla v9.4s, v19.4s, v1.4s\n"
+ "fmla v11.4s, v20.4s, v6.4s\n"
+ "fmla v8.4s, v19.4s, v2.4s\n"
+ "str s10, [x25]\n"
+ "fmla v12.4s, v23.4s, v0.4s\n"
+ "fmla v9.4s, v23.4s, v3.4s\n"
+ "fmla v14.4s, v17.4s, v0.4s\n"
+ "fmla v11.4s, v23.4s, v1.4s\n"
+ "fmla v8.4s, v23.4s, v4.4s\n"
+ "str s12, [x24, %[output_col_stride1]]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "str s14, [%[outptr0], x26]\n"
+ "fmla v11.4s, v17.4s, v3.4s\n"
+ "fmla v8.4s, v17.4s, v6.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "str s9, [x25, %[output_col_stride1]]\n"
+ "fmla v11.4s, v18.4s, v0.4s\n"
+ "fmla v8.4s, v24.4s, v1.4s\n"
+ "str s11, [x24, x26]\n"
+ "fmla v8.4s, v18.4s, v3.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v8.4s, v22.4s, v0.4s\n"
+ "str s8, [x25, x26]\n"
+ "add x25, x25, #4\n"
+ "7:\n"
+ : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
- int channels_remaining = n_channels;
- if (channels_remaining >= 4)
- {
- // Process blocks of 4 channels at a time
- int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
- const bool odd_tail = (channels_remaining / 4) & 1;
- channels_remaining %= 4;
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
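+    // Same 3x3 output-tile structure as the no-activation variant above, but with ReLU fused:
+    // each accumulator is clamped with fmax against the zero vector (v29) before being stored.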
+ "add x25, %[inptr0], %[input_row_stride]\n"
+ "add x16, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x21, %[outptr0], %[output_row_stride]\n"
+ "add x22, x25, %[input_row_stride]\n"
+ "add x23, x16, #64\n"
+ "add x26, x16, %[input_col_stride1]\n"
+ "add x13, x22, %[input_row_stride]\n"
+ "add x20, x26, #64\n"
+ "add x18, x26, %[input_col_stride1]\n"
+ "add x24, x13, %[input_row_stride]\n"
+ "add x15, x18, #64\n"
+ "add x14, x21, %[output_row_stride]\n"
+ "add x19, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x27, %[n_channels], #3\n"
+ "lsr x28, %[n_channels], #2\n"
+ "cbz x28, 4f\n"
+ "1:\n"
+ "ldr q20, [%[wbptr]]\n"
+ "subs x28, x28, #1\n"
+ "mov v4.16b, v20.16b\n"
+ "ldr q15, [%[wbptr], #16]\n"
+ "mov v1.16b, v20.16b\n"
+ "ldr q0, [%[wbptr], #32]\n"
+ "mov v3.16b, v20.16b\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "mov v7.16b, v20.16b\n"
+ "ldr q16, [%[wbptr], #64]\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr q12, [%[wbptr], #80]\n"
+ "mov v2.16b, v20.16b\n"
+ "ldr q17, [%[wbptr], #96]\n"
+ "mov v6.16b, v20.16b\n"
+ "ldr q11, [%[wbptr], #112]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr q10, [%[wbptr], #128]\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr q14, [%[wbptr], #144]\n"
+ "ldr q27, [%[inptr0]]\n"
+ "ldr q24, [x25]\n"
+ "fmla v4.4s, v27.4s, v15.4s\n"
+ "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q21, [x22]\n"
+ "ldr q19, [x25, %[input_col_stride1]]\n"
+ "ldr q31, [%[inptr0], x16]\n"
+ "ldr q28, [x13]\n"
+ "fmla v4.4s, v24.4s, v16.4s\n"
+ "ldr q18, [x22, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x25, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x17]\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "prfm pldl1keep, [x25, x17]\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "prfm pldl1keep, [x13, #64]\n"
+ "prfm pldl1keep, [x22, x17]\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v1.4s, v24.4s, v15.4s\n"
+ "ldr q24, [x25, x16]\n"
+ "fmla v4.4s, v22.4s, v0.4s\n"
+ "ldr q29, [%[inptr0], x26]\n"
+ "fmla v3.4s, v22.4s, v15.4s\n"
+ "ldr q30, [x24]\n"
+ "fmla v1.4s, v21.4s, v16.4s\n"
+ "ldr q25, [x13, %[input_col_stride1]]\n"
+ "fmla v4.4s, v21.4s, v11.4s\n"
+ "prfm pldl1keep, [x25, x23]\n"
+ "fmla v7.4s, v21.4s, v15.4s\n"
+ "ldr q26, [x22, x16]\n"
+ "fmla v1.4s, v19.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v4.4s, v19.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v3.4s, v19.4s, v16.4s\n"
+ "prfm pldl1keep, [x13, x17]\n"
+ "fmla v9.4s, v19.4s, v15.4s\n"
+ "ldr q23, [x25, x26]\n"
+ "fmla v4.4s, v31.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x23]\n"
+ "fmla v3.4s, v31.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x20]\n"
+ "fmla v2.4s, v31.4s, v15.4s\n"
+ "ldr q20, [%[inptr0], x18]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "ldr q28, [x24, %[input_col_stride1]]\n"
+ "fmla v4.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x17]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "prfm pldl1keep, [x13, x23]\n"
+ "fmla v3.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x22, x20]\n"
+ "fmla v7.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x15]\n"
+ "fmla v9.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x23]\n"
+ "fmla v6.4s, v18.4s, v15.4s\n"
+ "ldr q27, [x13, x16]\n"
+ "fmla v4.4s, v24.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, x20]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x15]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, x20]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "prfm pldl1keep, [x13, x15]\n"
+ "fmla v2.4s, v24.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v8.4s, v24.4s, v15.4s\n"
+ "ldr q24, [x22, x26]\n"
+ "fmla v3.4s, v29.4s, v13.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v29.4s, v0.4s\n"
+ "ldr q22, [x25, x18]\n"
+ "fmla v7.4s, v30.4s, v11.4s\n"
+ "ldr q21, [x24, x16]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v25.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v7.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v6.4s, v25.4s, v16.4s\n"
+ "ldr q19, [x13, x26]\n"
+ "fmla v4.4s, v26.4s, v14.4s\n"
+ "prfm pldl1keep, [%[inptr0], x17]\n"
+ "fmla v1.4s, v26.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v3.4s, v26.4s, v10.4s\n"
+ "add x25, x25, #16\n"
+ "fmla v7.4s, v26.4s, v13.4s\n"
+ "prfm pldl1keep, [x25, #64]\n"
+ "fmla v9.4s, v26.4s, v12.4s\n"
+ "prfm pldl1keep, [x25, x17]\n"
+ "fmla v2.4s, v26.4s, v11.4s\n"
+ "subs x28, x28, #1\n"
+ "fmla v6.4s, v26.4s, v0.4s\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "fmla v5.4s, v26.4s, v15.4s\n"
+ "ldr q26, [x22, x18]\n"
+ "fmla v3.4s, v23.4s, v17.4s\n"
+ "ldr q18, [x24, x26]\n"
+ "fmla v9.4s, v23.4s, v13.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v8.4s, v23.4s, v0.4s\n"
+ "ldr q23, [x13, x18]\n"
+ "fmla v7.4s, v28.4s, v10.4s\n"
+ "prfm pldl1keep, [x22, x17]\n"
+ "fmla v2.4s, v20.4s, v13.4s\n"
+ "ldr q25, [x24, x18]\n"
+ "fmla v6.4s, v28.4s, v11.4s\n"
+ "ldr q20, [%[wbptr]]\n"
+ "fmla v1.4s, v27.4s, v14.4s\n"
+ "add x13, x13, #16\n"
+ "fmla v7.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, #64]\n"
+ "fmla v9.4s, v27.4s, v10.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v6.4s, v27.4s, v12.4s\n"
+ "fmla v8.4s, v27.4s, v11.4s\n"
+ "fmla v5.4s, v27.4s, v16.4s\n"
+ "ldr q15, [%[wbptr], #16]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "ldr q27, [%[inptr0]]\n"
+ "fmla v9.4s, v24.4s, v17.4s\n"
+ "fmla v2.4s, v24.4s, v10.4s\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "fmla v8.4s, v24.4s, v12.4s\n"
+ "fmla v5.4s, v24.4s, v0.4s\n"
+ "ldr q16, [%[wbptr], #64]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "ldr q24, [x25]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v7.4s, v21.4s, v14.4s\n"
+ "fmla v6.4s, v21.4s, v10.4s\n"
+ "fmla v5.4s, v21.4s, v11.4s\n"
+ "ldr q0, [%[wbptr], #32]\n"
+ "fmla v9.4s, v19.4s, v14.4s\n"
+ "ldr q21, [x22]\n"
+ "fmla v6.4s, v19.4s, v17.4s\n"
+ "fmla v8.4s, v19.4s, v10.4s\n"
+ "fmla v5.4s, v19.4s, v12.4s\n"
+ "ldr q11, [%[wbptr], #112]\n"
+ "fmla v2.4s, v26.4s, v14.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v8.4s, v26.4s, v17.4s\n"
+ "fmla v6.4s, v18.4s, v14.4s\n"
+ "fmla v5.4s, v26.4s, v13.4s\n"
+ "ldr q12, [%[wbptr], #80]\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "ldr q19, [x25, %[input_col_stride1]]\n"
+ "fmla v8.4s, v23.4s, v14.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "str q4, [%[outptr0]]\n"
+ "fmla v5.4s, v18.4s, v10.4s\n"
+ "str q3, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "str q2, [%[outptr0], x19]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "str q1, [x21]\n"
+ "fmax v9.4s, v9.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v29.4s\n"
+ "ldr q10, [%[wbptr], #128]\n"
+ "str q9, [x21, %[output_col_stride1]]\n"
+ "fmla v5.4s, v25.4s, v14.4s\n"
+ "str q8, [x21, x19]\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "ldr q17, [%[wbptr], #96]\n"
+ "str q7, [x14]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "str q6, [x14, %[output_col_stride1]]\n"
+ "mov v4.16b, v20.16b\n"
+ "str q5, [x14, x19]\n"
+ "mov v1.16b, v20.16b\n"
+ "mov v3.16b, v20.16b\n"
+ "ldr q14, [%[wbptr], #144]\n"
+ "mov v7.16b, v20.16b\n"
+ "ldr q31, [%[inptr0], x16]\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr q28, [x13]\n"
+ "mov v2.16b, v20.16b\n"
+ "ldr q18, [x22, %[input_col_stride1]]\n"
+ "mov v6.16b, v20.16b\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "mov v8.16b, v20.16b\n"
+ "add x21, x21, #16\n"
+ "mov v5.16b, v20.16b\n"
+ "add x14, x14, #16\n"
+ "fmla v4.4s, v27.4s, v15.4s\n"
+ "fmla v4.4s, v24.4s, v16.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v1.4s, v24.4s, v15.4s\n"
+ "ldr q24, [x25, x16]\n"
+ "fmla v4.4s, v22.4s, v0.4s\n"
+ "ldr q29, [%[inptr0], x26]\n"
+ "fmla v3.4s, v22.4s, v15.4s\n"
+ "ldr q30, [x24]\n"
+ "fmla v1.4s, v21.4s, v16.4s\n"
+ "ldr q25, [x13, %[input_col_stride1]]\n"
+ "fmla v4.4s, v21.4s, v11.4s\n"
+ "prfm pldl1keep, [x25, x23]\n"
+ "fmla v7.4s, v21.4s, v15.4s\n"
+ "ldr q26, [x22, x16]\n"
+ "fmla v1.4s, v19.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v4.4s, v19.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v3.4s, v19.4s, v16.4s\n"
+ "prfm pldl1keep, [x13, x17]\n"
+ "fmla v9.4s, v19.4s, v15.4s\n"
+ "ldr q23, [x25, x26]\n"
+ "fmla v4.4s, v31.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x23]\n"
+ "fmla v3.4s, v31.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x20]\n"
+ "fmla v2.4s, v31.4s, v15.4s\n"
+ "ldr q20, [%[inptr0], x18]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "ldr q28, [x24, %[input_col_stride1]]\n"
+ "fmla v4.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x17]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "prfm pldl1keep, [x13, x23]\n"
+ "fmla v3.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x22, x20]\n"
+ "fmla v7.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x15]\n"
+ "fmla v9.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x23]\n"
+ "fmla v6.4s, v18.4s, v15.4s\n"
+ "ldr q27, [x13, x16]\n"
+ "fmla v4.4s, v24.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, x20]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x15]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, x20]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "prfm pldl1keep, [x13, x15]\n"
+ "fmla v2.4s, v24.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v8.4s, v24.4s, v15.4s\n"
+ "ldr q24, [x22, x26]\n"
+ "fmla v3.4s, v29.4s, v13.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v2.4s, v29.4s, v0.4s\n"
+ "ldr q22, [x25, x18]\n"
+ "fmla v7.4s, v30.4s, v11.4s\n"
+ "ldr q21, [x24, x16]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v25.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v7.4s, v25.4s, v12.4s\n"
+ "add x25, x25, #16\n"
+ "fmla v6.4s, v25.4s, v16.4s\n"
+ "ldr q19, [x13, x26]\n"
+ "fmla v4.4s, v26.4s, v14.4s\n"
+ "fmla v1.4s, v26.4s, v17.4s\n"
+ "fmla v3.4s, v26.4s, v10.4s\n"
+ "fmla v7.4s, v26.4s, v13.4s\n"
+ "fmla v9.4s, v26.4s, v12.4s\n"
+ "fmla v2.4s, v26.4s, v11.4s\n"
+ "fmla v6.4s, v26.4s, v0.4s\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "fmla v5.4s, v26.4s, v15.4s\n"
+ "ldr q26, [x22, x18]\n"
+ "fmla v3.4s, v23.4s, v17.4s\n"
+ "ldr q18, [x24, x26]\n"
+ "fmla v9.4s, v23.4s, v13.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "fmla v8.4s, v23.4s, v0.4s\n"
+ "fmla v7.4s, v28.4s, v10.4s\n"
+ "ldr q23, [x13, x18]\n"
+ "fmla v6.4s, v28.4s, v11.4s\n"
+ "ldr q25, [x24, x18]\n"
+ "fmla v2.4s, v20.4s, v13.4s\n"
+ "add x13, x13, #16\n"
+ "fmla v1.4s, v27.4s, v14.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v7.4s, v27.4s, v17.4s\n"
+ "fmla v9.4s, v27.4s, v10.4s\n"
+ "fmla v6.4s, v27.4s, v12.4s\n"
+ "fmla v8.4s, v27.4s, v11.4s\n"
+ "fmla v5.4s, v27.4s, v16.4s\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "fmla v9.4s, v24.4s, v17.4s\n"
+ "fmla v2.4s, v24.4s, v10.4s\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "fmla v8.4s, v24.4s, v12.4s\n"
+ "fmla v5.4s, v24.4s, v0.4s\n"
+ "fmla v7.4s, v21.4s, v14.4s\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "fmla v9.4s, v19.4s, v14.4s\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "fmla v6.4s, v21.4s, v10.4s\n"
+ "fmla v5.4s, v21.4s, v11.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v2.4s, v26.4s, v14.4s\n"
+ "fmla v6.4s, v19.4s, v17.4s\n"
+ "fmla v8.4s, v19.4s, v10.4s\n"
+ "fmla v5.4s, v19.4s, v12.4s\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "fmla v6.4s, v18.4s, v14.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "str q4, [%[outptr0]]\n"
+ "fmla v8.4s, v26.4s, v17.4s\n"
+ "str q3, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v5.4s, v26.4s, v13.4s\n"
+ "str q2, [%[outptr0], x19]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "fmla v8.4s, v23.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q1, [x21]\n"
+ "fmla v5.4s, v18.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v29.4s\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "str q9, [x21, %[output_col_stride1]]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "str q8, [x21, x19]\n"
+ "str q7, [x14]\n"
+ "str q6, [x14, %[output_col_stride1]]\n"
+ "add x21, x21, #16\n"
+ "fmla v5.4s, v25.4s, v14.4s\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "str q5, [x14, x19]\n"
+ "add x14, x14, #16\n"
+ "4:\n"
+ "cbz x27, 7f\n"
+ "ldr s20, [%[wbptr]]\n"
+ "mov v4.16b, v20.16b\n"
+ "ldr s15, [%[wbptr], #4]\n"
+ "mov v1.16b, v20.16b\n"
+ "ldr s0, [%[wbptr], #8]\n"
+ "mov v3.16b, v20.16b\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "mov v7.16b, v20.16b\n"
+ "ldr s16, [%[wbptr], #16]\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr s12, [%[wbptr], #20]\n"
+ "mov v2.16b, v20.16b\n"
+ "ldr s17, [%[wbptr], #24]\n"
+ "mov v6.16b, v20.16b\n"
+ "ldr s11, [%[wbptr], #28]\n"
+ "mov v8.16b, v20.16b\n"
+ "ldr s10, [%[wbptr], #32]\n"
+ "mov v5.16b, v20.16b\n"
+ "ldr s14, [%[wbptr], #36]\n"
+ "ldr s27, [%[inptr0]]\n"
+ "subs x27, x27, #1\n"
+ "fmla v4.4s, v27.4s, v15.4s\n"
+ "ldr s24, [x25]\n"
+ "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s21, [x22]\n"
+ "ldr s19, [x25, %[input_col_stride1]]\n"
+ "ldr s31, [%[inptr0], x16]\n"
+ "fmla v4.4s, v24.4s, v16.4s\n"
+ "ldr s28, [x13]\n"
+ "ldr s18, [x22, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x25, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x17]\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "prfm pldl1keep, [x25, x17]\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "prfm pldl1keep, [x13, #64]\n"
+ "prfm pldl1keep, [x22, x17]\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v1.4s, v24.4s, v15.4s\n"
+ "ldr s24, [x25, x16]\n"
+ "fmla v4.4s, v22.4s, v0.4s\n"
+ "ldr s29, [%[inptr0], x26]\n"
+ "fmla v3.4s, v22.4s, v15.4s\n"
+ "ldr s30, [x24]\n"
+ "fmla v1.4s, v21.4s, v16.4s\n"
+ "ldr s25, [x13, %[input_col_stride1]]\n"
+ "fmla v4.4s, v21.4s, v11.4s\n"
+ "prfm pldl1keep, [x25, x23]\n"
+ "fmla v7.4s, v21.4s, v15.4s\n"
+ "ldr s26, [x22, x16]\n"
+ "fmla v1.4s, v19.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v4.4s, v19.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v3.4s, v19.4s, v16.4s\n"
+ "prfm pldl1keep, [x13, x17]\n"
+ "fmla v9.4s, v19.4s, v15.4s\n"
+ "ldr s23, [x25, x26]\n"
+ "fmla v4.4s, v31.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x23]\n"
+ "fmla v3.4s, v31.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x20]\n"
+ "fmla v2.4s, v31.4s, v15.4s\n"
+ "ldr s20, [%[inptr0], x18]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "ldr s28, [x24, %[input_col_stride1]]\n"
+ "fmla v4.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x17]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "prfm pldl1keep, [x13, x23]\n"
+ "fmla v3.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x22, x20]\n"
+ "fmla v7.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x15]\n"
+ "fmla v9.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x23]\n"
+ "fmla v6.4s, v18.4s, v15.4s\n"
+ "ldr s27, [x13, x16]\n"
+ "fmla v4.4s, v24.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, x20]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x15]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, x20]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "prfm pldl1keep, [x13, x15]\n"
+ "fmla v2.4s, v24.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v8.4s, v24.4s, v15.4s\n"
+ "ldr s24, [x22, x26]\n"
+ "fmla v3.4s, v29.4s, v13.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v29.4s, v0.4s\n"
+ "ldr s22, [x25, x18]\n"
+ "fmla v7.4s, v30.4s, v11.4s\n"
+ "ldr s21, [x24, x16]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v25.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v7.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v6.4s, v25.4s, v16.4s\n"
+ "ldr s19, [x13, x26]\n"
+ "fmla v4.4s, v26.4s, v14.4s\n"
+ "prfm pldl1keep, [%[inptr0], x17]\n"
+ "fmla v1.4s, v26.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v3.4s, v26.4s, v10.4s\n"
+ "add x25, x25, #4\n"
+ "fmla v7.4s, v26.4s, v13.4s\n"
+ "prfm pldl1keep, [x25, #64]\n"
+ "fmla v9.4s, v26.4s, v12.4s\n"
+ "prfm pldl1keep, [x25, x17]\n"
+ "fmla v2.4s, v26.4s, v11.4s\n"
+ "subs x27, x27, #1\n"
+ "fmla v6.4s, v26.4s, v0.4s\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "fmla v5.4s, v26.4s, v15.4s\n"
+ "ldr s26, [x22, x18]\n"
+ "fmla v3.4s, v23.4s, v17.4s\n"
+ "ldr s18, [x24, x26]\n"
+ "fmla v9.4s, v23.4s, v13.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v8.4s, v23.4s, v0.4s\n"
+ "ldr s23, [x13, x18]\n"
+ "fmla v7.4s, v28.4s, v10.4s\n"
+ "prfm pldl1keep, [x22, x17]\n"
+ "fmla v2.4s, v20.4s, v13.4s\n"
+ "ldr s25, [x24, x18]\n"
+ "fmla v6.4s, v28.4s, v11.4s\n"
+ "ldr s20, [%[wbptr]]\n"
+ "fmla v1.4s, v27.4s, v14.4s\n"
+ "add x13, x13, #4\n"
+ "fmla v7.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, #64]\n"
+ "fmla v9.4s, v27.4s, v10.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v6.4s, v27.4s, v12.4s\n"
+ "fmla v8.4s, v27.4s, v11.4s\n"
+ "fmla v5.4s, v27.4s, v16.4s\n"
+ "ldr s15, [%[wbptr], #4]\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "ldr s27, [%[inptr0]]\n"
+ "fmla v9.4s, v24.4s, v17.4s\n"
+ "fmla v2.4s, v24.4s, v10.4s\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "fmla v8.4s, v24.4s, v12.4s\n"
+ "fmla v5.4s, v24.4s, v0.4s\n"
+ "ldr s16, [%[wbptr], #16]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "ldr s24, [x25]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v7.4s, v21.4s, v14.4s\n"
+ "fmla v6.4s, v21.4s, v10.4s\n"
+ "fmla v5.4s, v21.4s, v11.4s\n"
+ "ldr s0, [%[wbptr], #8]\n"
+ "fmla v9.4s, v19.4s, v14.4s\n"
+ "ldr s21, [x22]\n"
+ "fmla v6.4s, v19.4s, v17.4s\n"
+ "fmla v8.4s, v19.4s, v10.4s\n"
+ "fmla v5.4s, v19.4s, v12.4s\n"
+ "ldr s11, [%[wbptr], #28]\n"
+ "fmla v2.4s, v26.4s, v14.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v8.4s, v26.4s, v17.4s\n"
+ "fmla v6.4s, v18.4s, v14.4s\n"
+ "fmla v5.4s, v26.4s, v13.4s\n"
+ "ldr s12, [%[wbptr], #20]\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "ldr s19, [x25, %[input_col_stride1]]\n"
+ "fmla v8.4s, v23.4s, v14.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "str s4, [%[outptr0]]\n"
+ "fmla v5.4s, v18.4s, v10.4s\n"
+ "str s3, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "str s2, [%[outptr0], x19]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "str s1, [x21]\n"
+ "fmax v9.4s, v9.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v29.4s\n"
+ "ldr s10, [%[wbptr], #32]\n"
+ "str s9, [x21, %[output_col_stride1]]\n"
+ "fmla v5.4s, v25.4s, v14.4s\n"
+ "str s8, [x21, x19]\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "ldr s17, [%[wbptr], #24]\n"
+ "str s7, [x14]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "str s6, [x14, %[output_col_stride1]]\n"
+ "mov v4.16b, v20.16b\n"
+ "str s5, [x14, x19]\n"
+ "mov v1.16b, v20.16b\n"
+ "mov v3.16b, v20.16b\n"
+ "ldr s14, [%[wbptr], #36]\n"
+ "mov v7.16b, v20.16b\n"
+ "ldr s31, [%[inptr0], x16]\n"
+ "mov v9.16b, v20.16b\n"
+ "ldr s28, [x13]\n"
+ "mov v2.16b, v20.16b\n"
+ "ldr s18, [x22, %[input_col_stride1]]\n"
+ "mov v6.16b, v20.16b\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "mov v8.16b, v20.16b\n"
+ "add x21, x21, #4\n"
+ "mov v5.16b, v20.16b\n"
+ "add x14, x14, #4\n"
+ "fmla v4.4s, v27.4s, v15.4s\n"
+ "fmla v4.4s, v24.4s, v16.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v1.4s, v24.4s, v15.4s\n"
+ "ldr s24, [x25, x16]\n"
+ "fmla v4.4s, v22.4s, v0.4s\n"
+ "ldr s29, [%[inptr0], x26]\n"
+ "fmla v3.4s, v22.4s, v15.4s\n"
+ "ldr s30, [x24]\n"
+ "fmla v1.4s, v21.4s, v16.4s\n"
+ "ldr s25, [x13, %[input_col_stride1]]\n"
+ "fmla v4.4s, v21.4s, v11.4s\n"
+ "prfm pldl1keep, [x25, x23]\n"
+ "fmla v7.4s, v21.4s, v15.4s\n"
+ "ldr s26, [x22, x16]\n"
+ "fmla v1.4s, v19.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v4.4s, v19.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v3.4s, v19.4s, v16.4s\n"
+ "prfm pldl1keep, [x13, x17]\n"
+ "fmla v9.4s, v19.4s, v15.4s\n"
+ "ldr s23, [x25, x26]\n"
+ "fmla v4.4s, v31.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x23]\n"
+ "fmla v3.4s, v31.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x20]\n"
+ "fmla v2.4s, v31.4s, v15.4s\n"
+ "ldr s20, [%[inptr0], x18]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "ldr s28, [x24, %[input_col_stride1]]\n"
+ "fmla v4.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x17]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "prfm pldl1keep, [x13, x23]\n"
+ "fmla v3.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x22, x20]\n"
+ "fmla v7.4s, v18.4s, v0.4s\n"
+ "prfm pldl1keep, [x25, x15]\n"
+ "fmla v9.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x23]\n"
+ "fmla v6.4s, v18.4s, v15.4s\n"
+ "ldr s27, [x13, x16]\n"
+ "fmla v4.4s, v24.4s, v17.4s\n"
+ "prfm pldl1keep, [x13, x20]\n"
+ "fmla v1.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x22, x15]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "prfm pldl1keep, [x24, x20]\n"
+ "fmla v9.4s, v24.4s, v0.4s\n"
+ "prfm pldl1keep, [x13, x15]\n"
+ "fmla v2.4s, v24.4s, v16.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v8.4s, v24.4s, v15.4s\n"
+ "ldr s24, [x22, x26]\n"
+ "fmla v3.4s, v29.4s, v13.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v2.4s, v29.4s, v0.4s\n"
+ "ldr s22, [x25, x18]\n"
+ "fmla v7.4s, v30.4s, v11.4s\n"
+ "ldr s21, [x24, x16]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v9.4s, v25.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v7.4s, v25.4s, v12.4s\n"
+ "add x25, x25, #4\n"
+ "fmla v6.4s, v25.4s, v16.4s\n"
+ "ldr s19, [x13, x26]\n"
+ "fmla v4.4s, v26.4s, v14.4s\n"
+ "fmla v1.4s, v26.4s, v17.4s\n"
+ "fmla v3.4s, v26.4s, v10.4s\n"
+ "fmla v7.4s, v26.4s, v13.4s\n"
+ "fmla v9.4s, v26.4s, v12.4s\n"
+ "fmla v2.4s, v26.4s, v11.4s\n"
+ "fmla v6.4s, v26.4s, v0.4s\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "fmla v5.4s, v26.4s, v15.4s\n"
+ "ldr s26, [x22, x18]\n"
+ "fmla v3.4s, v23.4s, v17.4s\n"
+ "ldr s18, [x24, x26]\n"
+ "fmla v9.4s, v23.4s, v13.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v2.4s, v23.4s, v12.4s\n"
+ "fmla v8.4s, v23.4s, v0.4s\n"
+ "fmla v7.4s, v28.4s, v10.4s\n"
+ "ldr s23, [x13, x18]\n"
+ "fmla v6.4s, v28.4s, v11.4s\n"
+ "ldr s25, [x24, x18]\n"
+ "fmla v2.4s, v20.4s, v13.4s\n"
+ "add x13, x13, #4\n"
+ "fmla v1.4s, v27.4s, v14.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v7.4s, v27.4s, v17.4s\n"
+ "fmla v9.4s, v27.4s, v10.4s\n"
+ "fmla v6.4s, v27.4s, v12.4s\n"
+ "fmla v8.4s, v27.4s, v11.4s\n"
+ "fmla v5.4s, v27.4s, v16.4s\n"
+ "fmla v3.4s, v24.4s, v14.4s\n"
+ "fmla v9.4s, v24.4s, v17.4s\n"
+ "fmla v2.4s, v24.4s, v10.4s\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "fmla v8.4s, v24.4s, v12.4s\n"
+ "fmla v5.4s, v24.4s, v0.4s\n"
+ "fmla v7.4s, v21.4s, v14.4s\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "fmla v9.4s, v19.4s, v14.4s\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "fmla v6.4s, v21.4s, v10.4s\n"
+ "fmla v5.4s, v21.4s, v11.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v2.4s, v26.4s, v14.4s\n"
+ "fmla v6.4s, v19.4s, v17.4s\n"
+ "fmla v8.4s, v19.4s, v10.4s\n"
+ "fmla v5.4s, v19.4s, v12.4s\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "fmla v6.4s, v18.4s, v14.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "str s4, [%[outptr0]]\n"
+ "fmla v8.4s, v26.4s, v17.4s\n"
+ "str s3, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v5.4s, v26.4s, v13.4s\n"
+ "str s2, [%[outptr0], x19]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "fmla v8.4s, v23.4s, v14.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "str s1, [x21]\n"
+ "fmla v5.4s, v18.4s, v10.4s\n"
+ "fmax v9.4s, v9.4s, v29.4s\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v8.4s, v8.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "str s9, [x21, %[output_col_stride1]]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "str s8, [x21, x19]\n"
+ "str s7, [x14]\n"
+ "str s6, [x14, %[output_col_stride1]]\n"
+ "add x21, x21, #4\n"
+ "fmla v5.4s, v25.4s, v14.4s\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "str s5, [x14, x19]\n"
+ "add x14, x14, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
- asm volatile (
- "qU22B .req q0\n" "qU23B .req q0\n" "qW22A .req q0\n"
- "vU22B .req v0\n" "vU23B .req v0\n" "vW22A .req v0\n"
- "qV12A .req q1\n" "qW11B .req q1\n"
- "vV12A .req v1\n" "vW11B .req v1\n"
- "qU41A .req q2\n" "qU32B .req q2\n" "qU33A .req q2\n" "qV13B .req q2\n"
- "vU41A .req v2\n" "vU32B .req v2\n" "vU33A .req v2\n" "vV13B .req v2\n"
- "qU42B .req q3\n" "qU13B .req q3\n" "qU44B .req q3\n" "qU55A .req q3\n"
- "vU42B .req v3\n" "vU13B .req v3\n" "vU44B .req v3\n" "vU55A .req v3\n"
- "qU34B .req q4\n" "qU15A .req q4\n" "qU42A .req q4\n" "qU44A .req q4\n" "qU12B .req q4\n"
- "vU34B .req v4\n" "vU15A .req v4\n" "vU42A .req v4\n" "vU44A .req v4\n" "vU12B .req v4\n"
- "qU33B .req q5\n" "qU52A .req q5\n" "qW23A .req q5\n"
- "vU33B .req v5\n" "vU52A .req v5\n" "vW23A .req v5\n"
- "qV31A .req q6\n" "qU13A .req q6\n" "qV12B .req q6\n"
- "vV31A .req v6\n" "vU13A .req v6\n" "vV12B .req v6\n"
- "qU35B .req q7\n" "qU51B .req q7\n" "qV11A .req q7\n" "qU53B .req q7\n"
- "vU35B .req v7\n" "vU51B .req v7\n" "vV11A .req v7\n" "vU53B .req v7\n"
- "qW21A .req q8\n" "qV22B .req q8\n"
- "vW21A .req v8\n" "vV22B .req v8\n"
- "qV33B .req q9\n" "qU14A .req q9\n" "qV23A .req q9\n" "qU25B .req q9\n"
- "vV33B .req v9\n" "vU14A .req v9\n" "vV23A .req v9\n" "vU25B .req v9\n"
- "qW21B .req q10\n" "qV32A .req q10\n" "qU35A .req q10\n"
- "vW21B .req v10\n" "vV32A .req v10\n" "vU35A .req v10\n"
- "qV11B .req q11\n" "qU15B .req q11\n" "qV33A .req q11\n"
- "vV11B .req v11\n" "vU15B .req v11\n" "vV33A .req v11\n"
- "qU11B .req q12\n" "qW23B .req q12\n" "qU45A .req q12\n"
- "vU11B .req v12\n" "vW23B .req v12\n" "vU45A .req v12\n"
- "qW11A .req q13\n" "qU45B .req q13\n" "qU52B .req q13\n"
- "vW11A .req v13\n" "vU45B .req v13\n" "vU52B .req v13\n"
- "qU55B .req q14\n" "qU25A .req q14\n" "qV21A .req q14\n"
- "vU55B .req v14\n" "vU25A .req v14\n" "vV21A .req v14\n"
- "qU53A .req q15\n" "qV21B .req q15\n" "qU31A .req q15\n"
- "vU53A .req v15\n" "vV21B .req v15\n" "vU31A .req v15\n"
- "qW13B .req q16\n" "qU23A .req q16\n"
- "vW13B .req v16\n" "vU23A .req v16\n"
- "qW33B .req q17\n" "qW33A .req q17\n"
- "vW33B .req v17\n" "vW33A .req v17\n"
- "qU24B .req q18\n" "qU32A .req q18\n" "qV31B .req q18\n" "qV13A .req q18\n"
- "vU24B .req v18\n" "vU32A .req v18\n" "vV31B .req v18\n" "vV13A .req v18\n"
- "qU31B .req q19\n" "qU11A .req q19\n" "qU54B .req q19\n" "qU43A .req q19\n"
- "vU31B .req v19\n" "vU11A .req v19\n" "vU54B .req v19\n" "vU43A .req v19\n"
- "qU24A .req q20\n" "qW12B .req q20\n" "qU54A .req q20\n"
- "vU24A .req v20\n" "vW12B .req v20\n" "vU54A .req v20\n"
- "qV23B .req q21\n" "qW12A .req q21\n"
- "vV23B .req v21\n" "vW12A .req v21\n"
- "qW32A .req q22\n" "qU43B .req q22\n"
- "vW32A .req v22\n" "vU43B .req v22\n"
- "qW31A .req q23\n" "qV32B .req q23\n"
- "vW31A .req v23\n" "vV32B .req v23\n"
- "qU22A .req q24\n" "qW31B .req q24\n"
- "vU22A .req v24\n" "vW31B .req v24\n"
- "qU21B .req q25\n" "qV22A .req q25\n"
- "vU21B .req v25\n" "vV22A .req v25\n"
- "qU34A .req q26\n" "qW22B .req q26\n" "qU12A .req q26\n"
- "vU34A .req v26\n" "vW22B .req v26\n" "vU12A .req v26\n"
- "qW13A .req q27\n" "qU51A .req q27\n"
- "vW13A .req v27\n" "vU51A .req v27\n"
- "qW32B .req q28\n"
- "vW32B .req v28\n"
- "qU41B .req q29\n" "qU14B .req q29\n"
- "vU41B .req v29\n" "vU14B .req v29\n"
- "qU21A .req q30\n"
- "vU21A .req v30\n"
-
- "uptr1 .req x0\n"
- "uptr2 .req x1\n"
- "uptr3 .req x2\n"
- "uptr4 .req x3\n"
-
- "u_col_stride1 .req %x[u_col_stride]\n"
- "u_col_stride2 .req x4\n"
- "u_col_stride3 .req x5\n"
- "u_col_stride4 .req x6\n"
-
- "wptr1 .req x7\n"
- "wptr2 .req x8\n"
- "w_col_stride1 .req %x[w_col_stride]\n"
- "w_col_stride2 .req x9\n"
-
- "vptr1 .req x10\n"
- "vptr2 .req x11\n"
- "v_col_stride1 .req %x[v_col_stride]\n"
- "v_col_stride2 .req x12\n"
-
- // Prepare strides and pointers
- "add uptr1, %x[uptr0], %x[u_row_stride]\n"
- "add uptr2, uptr1 , %x[u_row_stride]\n"
- "add uptr3, uptr2 , %x[u_row_stride]\n"
- "add uptr4, uptr3 , %x[u_row_stride]\n"
- "add u_col_stride2, u_col_stride1, u_col_stride1\n"
- "add u_col_stride3, u_col_stride2, u_col_stride1\n"
- "add u_col_stride4, u_col_stride3, u_col_stride1\n"
-
- "add wptr1, %x[wptr0], %x[w_row_stride]\n"
- "add wptr2, wptr1 , %x[w_row_stride]\n"
- "add w_col_stride2, w_col_stride1, w_col_stride1\n"
-
- "add vptr1, %x[vptr0], %x[v_row_stride]\n"
- "add vptr2, vptr1 , %x[v_row_stride]\n"
- "add v_col_stride2, v_col_stride1, v_col_stride1\n"
-
- // Pre-load for A
- "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
- "ldr qW23A, [wptr1, w_col_stride2]\n"
- "ldr qW33A, [wptr2, w_col_stride2]\n"
- "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
- "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
- "ldr qW22A, [wptr1, w_col_stride1]\n"
- "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
- "ldr qW32A, [wptr2, w_col_stride1]\n"
- "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
- "ldr qU25A, [uptr1, u_col_stride4]\n"
- "ldr qU24A, [uptr1, u_col_stride3]\n"
- "ldr qW11A, [%x[wptr0]], #0x10\n"
- "ldr qU23A, [uptr1, u_col_stride2]\n"
- "ldr qW21A, [wptr1], #0x10\n"
- "ldr qW31A, [wptr2], #0x10\n"
- "ldr qU34A, [uptr2, u_col_stride3]\n"
- "ldr qU35A, [uptr2, u_col_stride4]\n"
-
- // First part of A
- "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
- "ldr qU33A, [uptr2, u_col_stride2]\n"
- "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
- "cbz %x[n_iters], 2f\n" // Jump to tail if not looping
-
- "1:" // Main loop, double unrolled
- // A Part
- "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
- "ldr qU45A, [uptr3, u_col_stride4]\n"
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
- "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
- "ldr qU55A, [uptr4, u_col_stride4]\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
- "ldr qU54A, [uptr4, u_col_stride3]\n"
- "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
- "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
- "ldr qU53A, [uptr4, u_col_stride2]\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
- "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
- "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
- "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
- "str qV13A, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
- "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
- "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
- "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
- "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
- "ldr qU22A, [uptr1, u_col_stride1]\n"
- "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
- "ldr qU32A, [uptr2, u_col_stride1]\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
- "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
- "str qV23A, [vptr1, v_col_stride2]\n"
- "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
- "ldr qW23B, [wptr1, w_col_stride2]\n"
- "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
- "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
- "ldr qU52A, [uptr4, u_col_stride1]\n"
- "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
- "ldr qW33B, [wptr2, w_col_stride2]\n"
- "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
- "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
- "str qV33A, [vptr2, v_col_stride2]\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU51A, [uptr4], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "ldr qW22B, [wptr1, w_col_stride1]\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], v_col_stride1]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "ldr qW32B, [wptr2, w_col_stride1]\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
- "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
- "ldr qU25B, [uptr1, u_col_stride4]\n"
- "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
- "ldr qU24B, [uptr1, u_col_stride3]\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, v_col_stride1]\n"
- "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
- "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
- "str qV32A, [vptr2, v_col_stride1]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "ldr qW11B, [%x[wptr0]], #0x10\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "ldr qU23B, [uptr1, u_col_stride2]\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "ldr qW21B, [wptr1], #0x10\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "ldr qW31B, [wptr2], #0x10\n"
- "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
- "ldr qU34B, [uptr2, u_col_stride3]\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
- "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
- "ldr qU35B, [uptr2, u_col_stride4]\n"
- "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
- "str qV31A, [vptr2], #0x10\n"
-
- // B Part
- "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
- "ldr qU33B, [uptr2, u_col_stride2]\n"
- "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
- "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
- "ldr qU45B, [uptr3, u_col_stride4]\n"
- "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
- "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
- "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
- "ldr qU44B, [uptr3, u_col_stride3]\n"
- "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
- "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
- "ldr qU43B, [uptr3, u_col_stride2]\n"
- "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
- "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
- "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
- "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
- "ldr qU55B, [uptr4, u_col_stride4]\n"
- "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
- "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
- "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
- "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
- "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
- "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
- "ldr qU54B, [uptr4, u_col_stride3]\n"
- "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
- "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
- "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
- "ldr qU53B, [uptr4, u_col_stride2]\n"
- "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
- "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
- "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
- "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
- "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
- "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
- "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
- "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
- "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
- "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
- "str qV13B, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
- "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
- "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
- "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
- "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
- "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
- "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
- "ldr qU22B, [uptr1, u_col_stride1]\n"
- "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
- "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
- "ldr qU32B, [uptr2, u_col_stride1]\n"
- "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
- "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
- "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
- "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
- "ldr qU42B, [uptr3, u_col_stride1]\n"
- "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
- "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
- "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
- "str qV23B, [vptr1, v_col_stride2]\n"
- "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
- "ldr qW23A, [wptr1, w_col_stride2]\n"
- "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
- "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
- "ldr qU52B, [uptr4, u_col_stride1]\n"
- "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
- "ldr qU11B, [%x[uptr0]], #0x10\n"
- "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
- "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
- "ldr qU21B, [uptr1], #0x10\n"
- "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
- "ldr qW33A, [wptr2, w_col_stride2]\n"
- "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
- "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
- "str qV33B, [vptr2, v_col_stride2]\n"
- "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
- "ldr qU31B, [uptr2], #0x10\n"
- "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
- "ldr qU41B, [uptr3], #0x10\n"
- "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
- "ldr qU51B, [uptr4], #0x10\n"
- "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
- "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
- "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
- "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
- "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
- "ldr qW22A, [wptr1, w_col_stride1]\n"
- "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
- "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
- "str qV12B, [%x[vptr0], v_col_stride1]\n"
- "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
- "ldr qW32A, [wptr2, w_col_stride1]\n"
- "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
- "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
- "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
- "ldr qU25A, [uptr1, u_col_stride4]\n"
- "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
- "ldr qU24A, [uptr1, u_col_stride3]\n"
- "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
- "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
- "str qV22B, [vptr1, v_col_stride1]\n"
- "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
- "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
- "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
- "subs %x[n_iters], %x[n_iters], #1\n"
- "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
- "str qV32B, [vptr2, v_col_stride1]\n"
- "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
- "ldr qW11A, [%x[wptr0]], #0x10\n"
- "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
- "ldr qU23A, [uptr1, u_col_stride2]\n"
- "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
- "ldr qW21A, [wptr1], #0x10\n"
- "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
- "str qV11B, [%x[vptr0]], #0x10\n"
- "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
- "ldr qW31A, [wptr2], #0x10\n"
- "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
- "ldr qU34A, [uptr2, u_col_stride3]\n"
- "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
- "str qV21B, [vptr1], #0x10\n"
- "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
- "ldr qU35A, [uptr2, u_col_stride4]\n"
- "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
- "str qV31B, [vptr2], #0x10\n"
-
- // First part of A
- "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
- "ldr qU33A, [uptr2, u_col_stride2]\n"
- "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
- "bne 1b\n" // Loop
-
- "2:" // Tail dispatch
- "cbnz %w[odd_tail], 3f\n"
-
- // Even tail
- // A Part
- "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
- "ldr qU45A, [uptr3, u_col_stride4]\n"
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
- "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
- "ldr qU55A, [uptr4, u_col_stride4]\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
- "ldr qU54A, [uptr4, u_col_stride3]\n"
- "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
- "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
- "ldr qU53A, [uptr4, u_col_stride2]\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
- "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
- "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
- "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
- "str qV13A, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
- "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
- "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
- "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
- "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
- "ldr qU22A, [uptr1, u_col_stride1]\n"
- "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
- "ldr qU32A, [uptr2, u_col_stride1]\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
- "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
- "str qV23A, [vptr1, v_col_stride2]\n"
- "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
- "ldr qW23B, [wptr1, w_col_stride2]\n"
- "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
- "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
- "ldr qU52A, [uptr4, u_col_stride1]\n"
- "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
- "ldr qW33B, [wptr2, w_col_stride2]\n"
- "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
- "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
- "str qV33A, [vptr2, v_col_stride2]\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU51A, [uptr4], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "ldr qW22B, [wptr1, w_col_stride1]\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], v_col_stride1]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "ldr qW32B, [wptr2, w_col_stride1]\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
- "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
- "ldr qU25B, [uptr1, u_col_stride4]\n"
- "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
- "ldr qU24B, [uptr1, u_col_stride3]\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, v_col_stride1]\n"
- "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
- "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
- "str qV32A, [vptr2, v_col_stride1]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "ldr qW11B, [%x[wptr0]], #0x10\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "ldr qU23B, [uptr1, u_col_stride2]\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "ldr qW21B, [wptr1], #0x10\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "ldr qW31B, [wptr2], #0x10\n"
- "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
- "ldr qU34B, [uptr2, u_col_stride3]\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
- "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
- "ldr qU35B, [uptr2, u_col_stride4]\n"
- "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
- "str qV31A, [vptr2], #0x10\n"
-
- // B Part
- "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
- "ldr qU33B, [uptr2, u_col_stride2]\n"
- "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
- "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
- "ldr qU45B, [uptr3, u_col_stride4]\n"
- "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
- "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
- "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
- "ldr qU44B, [uptr3, u_col_stride3]\n"
- "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
- "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
- "ldr qU43B, [uptr3, u_col_stride2]\n"
- "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
- "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
- "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
- "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
- "ldr qU55B, [uptr4, u_col_stride4]\n"
- "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
- "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
- "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
- "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
- "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
- "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
- "ldr qU54B, [uptr4, u_col_stride3]\n"
- "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
- "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
- "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
- "ldr qU53B, [uptr4, u_col_stride2]\n"
- "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
- "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
- "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
- "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
- "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
- "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
- "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
- "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
- "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
- "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
- "str qV13B, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
- "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
- "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
- "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
- "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
- "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
- "ldr qU22B, [uptr1, u_col_stride1]\n"
- "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
- "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
- "ldr qU32B, [uptr2, u_col_stride1]\n"
- "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
- "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
- "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
- "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
- "ldr qU42B, [uptr3, u_col_stride1]\n"
- "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
- "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
- "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
- "str qV23B, [vptr1, v_col_stride2]\n"
- "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
- "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
- "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
- "ldr qU52B, [uptr4, u_col_stride1]\n"
- "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
- "ldr qU11B, [%x[uptr0]], #0x10\n"
- "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
- "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
- "ldr qU21B, [uptr1], #0x10\n"
- "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
- "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
- "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
- "str qV33B, [vptr2, v_col_stride2]\n"
- "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
- "ldr qU31B, [uptr2], #0x10\n"
- "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
- "ldr qU41B, [uptr3], #0x10\n"
- "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
- "ldr qU51B, [uptr4], #0x10\n"
- "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
- "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
- "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
- "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
- "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
- "str qV12B, [%x[vptr0], v_col_stride1]\n"
- "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
- "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
- "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
- "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
- "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
- "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
- "str qV22B, [vptr1, v_col_stride1]\n"
- "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
- "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
- "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
- "subs %x[n_iters], %x[n_iters], #1\n"
- "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
- "str qV32B, [vptr2, v_col_stride1]\n"
- "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
- "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
- "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
- "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
- "str qV11B, [%x[vptr0]], #0x10\n"
- "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
- "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
- "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
- "str qV21B, [vptr1], #0x10\n"
- "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
- "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
- "str qV31B, [vptr2], #0x10\n"
-
- "b 4f\n" // Branch to end of method
-
- "3:" // Odd tail, finish off A
- "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
- "ldr qU45A, [uptr3, u_col_stride4]\n"
- "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
- "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
- "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
- "ldr qU44A, [uptr3, u_col_stride3]\n"
- "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
- "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
- "ldr qU43A, [uptr3, u_col_stride2]\n"
- "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
- "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
- "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
- "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
- "ldr qU55A, [uptr4, u_col_stride4]\n"
- "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
- "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
- "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
- "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
- "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
- "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
- "ldr qU54A, [uptr4, u_col_stride3]\n"
- "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
- "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
- "ldr qU53A, [uptr4, u_col_stride2]\n"
- "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
- "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
- "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
- "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
- "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
- "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
- "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
- "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
- "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
- "str qV13A, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
- "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
- "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
- "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
- "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
- "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
- "ldr qU22A, [uptr1, u_col_stride1]\n"
- "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
- "ldr qU32A, [uptr2, u_col_stride1]\n"
- "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
- "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
- "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
- "ldr qU42A, [uptr3, u_col_stride1]\n"
- "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
- "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
- "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
- "str qV23A, [vptr1, v_col_stride2]\n"
- "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
- "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
- "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
- "ldr qU52A, [uptr4, u_col_stride1]\n"
- "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
- "ldr qU11A, [%x[uptr0]], #0x10\n"
- "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
- "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
- "ldr qU21A, [uptr1], #0x10\n"
- "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
- "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
- "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
- "str qV33A, [vptr2, v_col_stride2]\n"
- "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
- "ldr qU31A, [uptr2], #0x10\n"
- "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
- "ldr qU41A, [uptr3], #0x10\n"
- "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
- "ldr qU51A, [uptr4], #0x10\n"
- "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
- "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
- "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
- "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
- "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
- "str qV12A, [%x[vptr0], v_col_stride1]\n"
- "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
- "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
- "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
- "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
- "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
- "str qV22A, [vptr1, v_col_stride1]\n"
- "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
- "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
- "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
- "str qV32A, [vptr2, v_col_stride1]\n"
- "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
- "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
- "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
- "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
- "str qV11A, [%x[vptr0]], #0x10\n"
- "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
- "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
- "str qV21A, [vptr1], #0x10\n"
- "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
- "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
- "str qV31A, [vptr2], #0x10\n"
-
- "4:" // End of method
- ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
- ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
- ".unreq u_col_stride3\n" ".unreq u_col_stride4\n"
- ".unreq wptr1\n" ".unreq wptr2\n"
- ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
- ".unreq vptr1\n" ".unreq vptr2\n"
- ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
-
- ".unreq qU22B\n" ".unreq qW13B\n" ".unreq qW13A\n" ".unreq qU51B\n"
- ".unreq qU54B\n" ".unreq qU45A\n" ".unreq qU15A\n" ".unreq qU41B\n"
- ".unreq qU24B\n" ".unreq qU21A\n"
- ".unreq qV11B\n" ".unreq qU51A\n" ".unreq qU35A\n" ".unreq qU12A\n"
- ".unreq qU42B\n" ".unreq qU44B\n" ".unreq qU13B\n" ".unreq qW33A\n"
- ".unreq qV31B\n" ".unreq qV23A\n" ".unreq qU31A\n" ".unreq qU35B\n" ".unreq qU13A\n"
- ".unreq qV23B\n" ".unreq qU11A\n" ".unreq qU25A\n" ".unreq qU43A\n" ".unreq qU52B\n"
- ".unreq qU24A\n" ".unreq qU23B\n" ".unreq qV21A\n" ".unreq qV32B\n"
- ".unreq qV33B\n" ".unreq qW11A\n" ".unreq qU31B\n"
- ".unreq qW12B\n" ".unreq qU33A\n" ".unreq qU14A\n" ".unreq qU22A\n"
- ".unreq qU25B\n" ".unreq qU53B\n" ".unreq qU42A\n" ".unreq qU44A\n"
- ".unreq qU43B\n" ".unreq qW31A\n" ".unreq qU11B\n"
- ".unreq qW11B\n" ".unreq qW32A\n"
- ".unreq qU12B\n" ".unreq qU34B\n" ".unreq qW21A\n"
- ".unreq qU14B\n" ".unreq qV21B\n" ".unreq qW22A\n"
- ".unreq qW23B\n" ".unreq qW23A\n" ".unreq qU21B\n"
- ".unreq qU32B\n" ".unreq qU34A\n" ".unreq qU45B\n" ".unreq qV31A\n"
- ".unreq qW12A\n" ".unreq qU33B\n" ".unreq qU15B\n"
- ".unreq qW33B\n" ".unreq qU54A\n" ".unreq qU23A\n"
- ".unreq qW32B\n" ".unreq qV33A\n" ".unreq qW31B\n" ".unreq qV12A\n"
- ".unreq qV12B\n" ".unreq qU41A\n" ".unreq qU53A\n"
- ".unreq qV13A\n" ".unreq qU32A\n" ".unreq qW22B\n"
- ".unreq qV22B\n" ".unreq qU52A\n" ".unreq qV13B\n" ".unreq qV32A\n"
- ".unreq qU55A\n" ".unreq qU55B\n" ".unreq qV22A\n" ".unreq qW21B\n"
- ".unreq qV11A\n"
- ".unreq vU22B\n" ".unreq vW13B\n" ".unreq vW13A\n" ".unreq vU51B\n"
- ".unreq vU54B\n" ".unreq vU45A\n" ".unreq vU15A\n" ".unreq vU41B\n"
- ".unreq vU24B\n" ".unreq vU21A\n"
- ".unreq vV11B\n" ".unreq vU51A\n" ".unreq vU35A\n" ".unreq vU12A\n"
- ".unreq vU42B\n" ".unreq vU44B\n" ".unreq vU13B\n" ".unreq vW33A\n"
- ".unreq vV31B\n" ".unreq vV23A\n" ".unreq vU31A\n" ".unreq vU35B\n" ".unreq vU13A\n"
- ".unreq vV23B\n" ".unreq vU11A\n" ".unreq vU25A\n" ".unreq vU43A\n" ".unreq vU52B\n"
- ".unreq vU24A\n" ".unreq vU23B\n" ".unreq vV21A\n" ".unreq vV32B\n"
- ".unreq vV33B\n" ".unreq vW11A\n" ".unreq vU31B\n"
- ".unreq vW12B\n" ".unreq vU33A\n" ".unreq vU14A\n" ".unreq vU22A\n"
- ".unreq vU25B\n" ".unreq vU53B\n" ".unreq vU42A\n" ".unreq vU44A\n"
- ".unreq vU43B\n" ".unreq vW31A\n" ".unreq vU11B\n"
- ".unreq vW11B\n" ".unreq vW32A\n"
- ".unreq vU12B\n" ".unreq vU34B\n" ".unreq vW21A\n"
- ".unreq vU14B\n" ".unreq vV21B\n" ".unreq vW22A\n"
- ".unreq vW23B\n" ".unreq vW23A\n" ".unreq vU21B\n"
- ".unreq vU32B\n" ".unreq vU34A\n" ".unreq vU45B\n" ".unreq vV31A\n"
- ".unreq vW12A\n" ".unreq vU33B\n" ".unreq vU15B\n"
- ".unreq vW33B\n" ".unreq vU54A\n" ".unreq vU23A\n"
- ".unreq vW32B\n" ".unreq vV33A\n" ".unreq vW31B\n" ".unreq vV12A\n"
- ".unreq vV12B\n" ".unreq vU41A\n" ".unreq vU53A\n"
- ".unreq vV13A\n" ".unreq vU32A\n" ".unreq vW22B\n"
- ".unreq vV22B\n" ".unreq vU52A\n" ".unreq vV13B\n" ".unreq vV32A\n"
- ".unreq vU55A\n" ".unreq vU55B\n" ".unreq vV22A\n" ".unreq vW21B\n"
- ".unreq vV11A\n"
- : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
- [n_iters] "+r" (n_iters)
- : [u_row_stride] "r" (in_row_stride * sizeof(float)),
- [u_col_stride] "r" (in_col_stride * sizeof(float)),
- [w_row_stride] "r" (weight_row_stride * sizeof(float)),
- [w_col_stride] "r" (weight_col_stride * sizeof(float)),
- [v_row_stride] "r" (out_row_stride * sizeof(float)),
- [v_col_stride] "r" (out_col_stride * sizeof(float)),
- [odd_tail] "r" (odd_tail)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
- "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
- "x12", "cc", "memory"
- );
- }
- if (channels_remaining)
- {
- // Fall back on the unoptimised version to clean up the tail
- ConvImpl::process_tile<false>(
- channels_remaining,
- wptr0, weight_row_stride, weight_col_stride,
- uptr0, in_row_stride, in_col_stride,
- vptr0, out_row_stride, out_col_stride,
- 0, 0, 0, 0, 0, 0
- );
- }
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x17, %[inptr0], %[input_row_stride]\n"
+ "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x25, %[outptr0], %[output_row_stride]\n"
+ "add x14, x17, %[input_row_stride]\n"
+ "add x22, x18, #64\n"
+ "add x15, x18, %[input_col_stride1]\n"
+ "add x21, x14, %[input_row_stride]\n"
+ "add x16, x15, #64\n"
+ "add x24, x15, %[input_col_stride1]\n"
+ "add x26, x21, %[input_row_stride]\n"
+ "add x23, x24, #64\n"
+ "add x13, x25, %[output_row_stride]\n"
+ "add x27, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x19, %[n_channels], #3\n"
+ "lsr x20, %[n_channels], #2\n"
+ "cbz x20, 4f\n"
+ "1:\n"
+ "ldr q19, [%[wbptr]]\n"
+ "subs x20, x20, #1\n"
+ "mov v8.16b, v19.16b\n"
+ "ldr q17, [%[wbptr], #16]\n"
+ "mov v5.16b, v19.16b\n"
+ "ldr q16, [%[wbptr], #32]\n"
+ "mov v7.16b, v19.16b\n"
+ "ldr q15, [%[wbptr], #48]\n"
+ "mov v2.16b, v19.16b\n"
+ "ldr q14, [%[wbptr], #64]\n"
+ "mov v4.16b, v19.16b\n"
+ "ldr q13, [%[wbptr], #80]\n"
+ "mov v6.16b, v19.16b\n"
+ "ldr q12, [%[wbptr], #96]\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr q11, [%[wbptr], #112]\n"
+ "mov v3.16b, v19.16b\n"
+ "ldr q10, [%[wbptr], #128]\n"
+ "mov v0.16b, v19.16b\n"
+ "ldr q9, [%[wbptr], #144]\n"
+ "ldr q25, [%[inptr0]]\n"
+ "ldr q27, [x17]\n"
+ "fmla v8.4s, v25.4s, v17.4s\n"
+ "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q20, [x14]\n"
+ "ldr q22, [x17, %[input_col_stride1]]\n"
+ "ldr q28, [%[inptr0], x18]\n"
+ "ldr q23, [x21]\n"
+ "fmla v8.4s, v27.4s, v14.4s\n"
+ "ldr q18, [x14, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x17, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "prfm pldl1keep, [x14, #64]\n"
+ "prfm pldl1keep, [x17, x28]\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "prfm pldl1keep, [x14, x28]\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v5.4s, v27.4s, v17.4s\n"
+ "ldr q27, [x17, x18]\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "ldr q30, [%[inptr0], x15]\n"
+ "fmla v7.4s, v26.4s, v17.4s\n"
+ "ldr q31, [x26]\n"
+ "fmla v5.4s, v20.4s, v14.4s\n"
+ "ldr q24, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x17, x22]\n"
+ "fmla v2.4s, v20.4s, v17.4s\n"
+ "ldr q29, [x14, x18]\n"
+ "fmla v5.4s, v22.4s, v16.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v7.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x21, x28]\n"
+ "fmla v4.4s, v22.4s, v17.4s\n"
+ "ldr q21, [x17, x15]\n"
+ "fmla v8.4s, v28.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x22]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x16]\n"
+ "fmla v6.4s, v28.4s, v17.4s\n"
+ "ldr q19, [%[inptr0], x24]\n"
+ "fmla v5.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "ldr q28, [x26, %[input_col_stride1]]\n"
+ "fmla v8.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x26, x28]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "prfm pldl1keep, [x21, x22]\n"
+ "fmla v7.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x14, x16]\n"
+ "fmla v2.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x23]\n"
+ "fmla v4.4s, v18.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x22]\n"
+ "fmla v1.4s, v18.4s, v17.4s\n"
+ "ldr q25, [x21, x18]\n"
+ "fmla v8.4s, v27.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v5.4s, v27.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "fmla v7.4s, v27.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "fmla v4.4s, v27.4s, v16.4s\n"
+ "prfm pldl1keep, [x21, x23]\n"
+ "fmla v6.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "fmla v3.4s, v27.4s, v17.4s\n"
+ "ldr q27, [x14, x15]\n"
+ "fmla v7.4s, v30.4s, v15.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v6.4s, v30.4s, v16.4s\n"
+ "ldr q26, [x17, x24]\n"
+ "fmla v2.4s, v31.4s, v11.4s\n"
+ "ldr q20, [x26, x18]\n"
+ "fmla v5.4s, v24.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v4.4s, v24.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v2.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v1.4s, v24.4s, v14.4s\n"
+ "ldr q18, [x21, x15]\n"
+ "fmla v8.4s, v29.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "fmla v5.4s, v29.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v7.4s, v29.4s, v10.4s\n"
+ "add x17, x17, #16\n"
+ "fmla v2.4s, v29.4s, v15.4s\n"
+ "prfm pldl1keep, [x17, #64]\n"
+ "fmla v4.4s, v29.4s, v13.4s\n"
+ "prfm pldl1keep, [x17, x28]\n"
+ "fmla v6.4s, v29.4s, v11.4s\n"
+ "subs x20, x20, #1\n"
+ "fmla v1.4s, v29.4s, v16.4s\n"
+ "fmla v3.4s, v29.4s, v14.4s\n"
+ "fmla v0.4s, v29.4s, v17.4s\n"
+ "ldr q22, [x14, x24]\n"
+ "fmla v7.4s, v21.4s, v12.4s\n"
+ "ldr q23, [x26, x15]\n"
+ "fmla v4.4s, v21.4s, v15.4s\n"
+ "add x14, x14, #16\n"
+ "fmla v6.4s, v21.4s, v13.4s\n"
+ "prfm pldl1keep, [x14, #64]\n"
+ "fmla v3.4s, v21.4s, v16.4s\n"
+ "ldr q24, [x21, x24]\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "prfm pldl1keep, [x14, x28]\n"
+ "fmla v6.4s, v19.4s, v15.4s\n"
+ "ldr q21, [x26, x24]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "ldr q19, [%[wbptr]]\n"
+ "fmla v5.4s, v25.4s, v9.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v2.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v1.4s, v25.4s, v13.4s\n"
+ "fmla v3.4s, v25.4s, v11.4s\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "ldr q17, [%[wbptr], #16]\n"
+ "fmla v7.4s, v27.4s, v9.4s\n"
+ "ldr q25, [%[inptr0]]\n"
+ "fmla v4.4s, v27.4s, v12.4s\n"
+ "fmla v6.4s, v27.4s, v10.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v3.4s, v27.4s, v13.4s\n"
+ "fmla v0.4s, v27.4s, v16.4s\n"
+ "ldr q14, [%[wbptr], #64]\n"
+ "fmla v6.4s, v26.4s, v12.4s\n"
+ "ldr q27, [x17]\n"
+ "fmla v3.4s, v26.4s, v15.4s\n"
+ "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v20.4s, v9.4s\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v0.4s, v20.4s, v11.4s\n"
+ "ldr q16, [%[wbptr], #32]\n"
+ "fmla v4.4s, v18.4s, v9.4s\n"
+ "ldr q20, [x14]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v18.4s, v13.4s\n"
+ "ldr q11, [%[wbptr], #112]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "movi v30.16b, #0\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v0.4s, v22.4s, v15.4s\n"
+ "ldr q13, [%[wbptr], #80]\n"
+ "fmov v29.4s, #6.0\n"
+ "fmax v8.4s, v8.4s, v30.4s\n"
+ "fmla v3.4s, v24.4s, v9.4s\n"
+ "fmax v7.4s, v7.4s, v30.4s\n"
+ "fmla v0.4s, v23.4s, v10.4s\n"
+ "ldr q15, [%[wbptr], #48]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "ldr q22, [x17, %[input_col_stride1]]\n"
+ "fmin v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v30.4s\n"
+ "str q8, [%[outptr0]]\n"
+ "fmla v0.4s, v24.4s, v12.4s\n"
+ "str q7, [%[outptr0], %[output_col_stride1]]\n"
+ "fmin v6.4s, v6.4s, v29.4s\n"
+ "fmax v5.4s, v5.4s, v30.4s\n"
+ "ldr q10, [%[wbptr], #128]\n"
+ "str q6, [%[outptr0], x27]\n"
+ "fmla v0.4s, v21.4s, v9.4s\n"
+ "fmin v5.4s, v5.4s, v29.4s\n"
+ "ldr q12, [%[wbptr], #96]\n"
+ "fmax v4.4s, v4.4s, v30.4s\n"
+ "ldr q28, [%[inptr0], x18]\n"
+ "str q5, [x25]\n"
+ "fmax v3.4s, v3.4s, v30.4s\n"
+ "fmin v4.4s, v4.4s, v29.4s\n"
+ "ldr q9, [%[wbptr], #144]\n"
+ "fmin v3.4s, v3.4s, v29.4s\n"
+ "ldr q23, [x21]\n"
+ "str q4, [x25, %[output_col_stride1]]\n"
+ "fmax v2.4s, v2.4s, v30.4s\n"
+ "str q3, [x25, x27]\n"
+ "fmax v1.4s, v1.4s, v30.4s\n"
+ "fmin v2.4s, v2.4s, v29.4s\n"
+ "ldr q18, [x14, %[input_col_stride1]]\n"
+ "fmin v1.4s, v1.4s, v29.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q2, [x13]\n"
+ "fmax v0.4s, v0.4s, v30.4s\n"
+ "str q1, [x13, %[output_col_stride1]]\n"
+ "mov v8.16b, v19.16b\n"
+ "fmin v0.4s, v0.4s, v29.4s\n"
+ "add x25, x25, #16\n"
+ "mov v5.16b, v19.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "str q0, [x13, x27]\n"
+ "mov v2.16b, v19.16b\n"
+ "mov v4.16b, v19.16b\n"
+ "add x13, x13, #16\n"
+ "mov v6.16b, v19.16b\n"
+ "mov v1.16b, v19.16b\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v0.16b, v19.16b\n"
+ "fmla v8.4s, v25.4s, v17.4s\n"
+ "fmla v8.4s, v27.4s, v14.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v5.4s, v27.4s, v17.4s\n"
+ "ldr q27, [x17, x18]\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "ldr q30, [%[inptr0], x15]\n"
+ "fmla v7.4s, v26.4s, v17.4s\n"
+ "ldr q31, [x26]\n"
+ "fmla v5.4s, v20.4s, v14.4s\n"
+ "ldr q24, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x17, x22]\n"
+ "fmla v2.4s, v20.4s, v17.4s\n"
+ "ldr q29, [x14, x18]\n"
+ "fmla v5.4s, v22.4s, v16.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v7.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x21, x28]\n"
+ "fmla v4.4s, v22.4s, v17.4s\n"
+ "ldr q21, [x17, x15]\n"
+ "fmla v8.4s, v28.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x22]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x16]\n"
+ "fmla v6.4s, v28.4s, v17.4s\n"
+ "ldr q19, [%[inptr0], x24]\n"
+ "fmla v5.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "ldr q28, [x26, %[input_col_stride1]]\n"
+ "fmla v8.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x26, x28]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "prfm pldl1keep, [x21, x22]\n"
+ "fmla v7.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x14, x16]\n"
+ "fmla v2.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x23]\n"
+ "fmla v4.4s, v18.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x22]\n"
+ "fmla v1.4s, v18.4s, v17.4s\n"
+ "ldr q25, [x21, x18]\n"
+ "fmla v8.4s, v27.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v5.4s, v27.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "fmla v7.4s, v27.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "fmla v4.4s, v27.4s, v16.4s\n"
+ "prfm pldl1keep, [x21, x23]\n"
+ "fmla v6.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "fmla v3.4s, v27.4s, v17.4s\n"
+ "ldr q27, [x14, x15]\n"
+ "fmla v7.4s, v30.4s, v15.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v6.4s, v30.4s, v16.4s\n"
+ "ldr q26, [x17, x24]\n"
+ "fmla v2.4s, v31.4s, v11.4s\n"
+ "ldr q20, [x26, x18]\n"
+ "fmla v5.4s, v24.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v4.4s, v24.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v2.4s, v24.4s, v13.4s\n"
+ "add x17, x17, #16\n"
+ "fmla v1.4s, v24.4s, v14.4s\n"
+ "ldr q18, [x21, x15]\n"
+ "fmla v8.4s, v29.4s, v9.4s\n"
+ "fmla v5.4s, v29.4s, v12.4s\n"
+ "fmla v7.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v15.4s\n"
+ "fmla v4.4s, v29.4s, v13.4s\n"
+ "fmla v6.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v16.4s\n"
+ "fmla v3.4s, v29.4s, v14.4s\n"
+ "fmla v0.4s, v29.4s, v17.4s\n"
+ "ldr q22, [x14, x24]\n"
+ "fmla v7.4s, v21.4s, v12.4s\n"
+ "ldr q23, [x26, x15]\n"
+ "fmla v4.4s, v21.4s, v15.4s\n"
+ "add x14, x14, #16\n"
+ "fmla v6.4s, v21.4s, v13.4s\n"
+ "fmla v3.4s, v21.4s, v16.4s\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "ldr q24, [x21, x24]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "ldr q21, [x26, x24]\n"
+ "fmla v6.4s, v19.4s, v15.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v5.4s, v25.4s, v9.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v2.4s, v25.4s, v12.4s\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v13.4s\n"
+ "fmla v3.4s, v25.4s, v11.4s\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "fmla v7.4s, v27.4s, v9.4s\n"
+ "fmla v4.4s, v27.4s, v12.4s\n"
+ "fmla v6.4s, v27.4s, v10.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v3.4s, v27.4s, v13.4s\n"
+ "fmla v0.4s, v27.4s, v16.4s\n"
+ "fmla v2.4s, v20.4s, v9.4s\n"
+ "fmla v6.4s, v26.4s, v12.4s\n"
+ "fmla v4.4s, v18.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v15.4s\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v0.4s, v20.4s, v11.4s\n"
+ "movi v30.16b, #0\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "fmov v29.4s, #6.0\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v18.4s, v13.4s\n"
+ "fmax v8.4s, v8.4s, v30.4s\n"
+ "fmax v7.4s, v7.4s, v30.4s\n"
+ "fmax v6.4s, v6.4s, v30.4s\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v0.4s, v22.4s, v15.4s\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v7.4s, v7.4s, v29.4s\n"
+ "fmin v6.4s, v6.4s, v29.4s\n"
+ "str q8, [%[outptr0]]\n"
+ "fmla v3.4s, v24.4s, v9.4s\n"
+ "str q7, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v23.4s, v10.4s\n"
+ "str q6, [%[outptr0], x27]\n"
+ "fmax v5.4s, v5.4s, v30.4s\n"
+ "fmax v4.4s, v4.4s, v30.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v0.4s, v24.4s, v12.4s\n"
+ "fmin v5.4s, v5.4s, v29.4s\n"
+ "fmin v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v30.4s\n"
+ "str q5, [x25]\n"
+ "fmax v2.4s, v2.4s, v30.4s\n"
+ "str q4, [x25, %[output_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v9.4s\n"
+ "fmin v3.4s, v3.4s, v29.4s\n"
+ "fmin v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v30.4s\n"
+ "str q3, [x25, x27]\n"
+ "str q2, [x13]\n"
+ "fmin v1.4s, v1.4s, v29.4s\n"
+ "fmax v0.4s, v0.4s, v30.4s\n"
+ "add x25, x25, #16\n"
+ "str q1, [x13, %[output_col_stride1]]\n"
+ "fmin v0.4s, v0.4s, v29.4s\n"
+ "str q0, [x13, x27]\n"
+ "add x13, x13, #16\n"
+ "4:\n"
+ "cbz x19, 7f\n"
+ "ldr s19, [%[wbptr]]\n"
+ "mov v8.16b, v19.16b\n"
+ "ldr s17, [%[wbptr], #4]\n"
+ "mov v5.16b, v19.16b\n"
+ "ldr s16, [%[wbptr], #8]\n"
+ "mov v7.16b, v19.16b\n"
+ "ldr s15, [%[wbptr], #12]\n"
+ "mov v2.16b, v19.16b\n"
+ "ldr s14, [%[wbptr], #16]\n"
+ "mov v4.16b, v19.16b\n"
+ "ldr s13, [%[wbptr], #20]\n"
+ "mov v6.16b, v19.16b\n"
+ "ldr s12, [%[wbptr], #24]\n"
+ "mov v1.16b, v19.16b\n"
+ "ldr s11, [%[wbptr], #28]\n"
+ "mov v3.16b, v19.16b\n"
+ "ldr s10, [%[wbptr], #32]\n"
+ "mov v0.16b, v19.16b\n"
+ "ldr s9, [%[wbptr], #36]\n"
+ "ldr s25, [%[inptr0]]\n"
+ "subs x19, x19, #1\n"
+ "fmla v8.4s, v25.4s, v17.4s\n"
+ "ldr s27, [x17]\n"
+ "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s20, [x14]\n"
+ "ldr s22, [x17, %[input_col_stride1]]\n"
+ "ldr s28, [%[inptr0], x18]\n"
+ "fmla v8.4s, v27.4s, v14.4s\n"
+ "ldr s23, [x21]\n"
+ "ldr s18, [x14, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x17, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "prfm pldl1keep, [x14, #64]\n"
+ "prfm pldl1keep, [x17, x28]\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "prfm pldl1keep, [x14, x28]\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v5.4s, v27.4s, v17.4s\n"
+ "ldr s27, [x17, x18]\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "ldr s30, [%[inptr0], x15]\n"
+ "fmla v7.4s, v26.4s, v17.4s\n"
+ "ldr s31, [x26]\n"
+ "fmla v5.4s, v20.4s, v14.4s\n"
+ "ldr s24, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x17, x22]\n"
+ "fmla v2.4s, v20.4s, v17.4s\n"
+ "ldr s29, [x14, x18]\n"
+ "fmla v5.4s, v22.4s, v16.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v7.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x21, x28]\n"
+ "fmla v4.4s, v22.4s, v17.4s\n"
+ "ldr s21, [x17, x15]\n"
+ "fmla v8.4s, v28.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x22]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x16]\n"
+ "fmla v6.4s, v28.4s, v17.4s\n"
+ "ldr s19, [%[inptr0], x24]\n"
+ "fmla v5.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "ldr s28, [x26, %[input_col_stride1]]\n"
+ "fmla v8.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x26, x28]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "prfm pldl1keep, [x21, x22]\n"
+ "fmla v7.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x14, x16]\n"
+ "fmla v2.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x23]\n"
+ "fmla v4.4s, v18.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x22]\n"
+ "fmla v1.4s, v18.4s, v17.4s\n"
+ "ldr s25, [x21, x18]\n"
+ "fmla v8.4s, v27.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v5.4s, v27.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "fmla v7.4s, v27.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "fmla v4.4s, v27.4s, v16.4s\n"
+ "prfm pldl1keep, [x21, x23]\n"
+ "fmla v6.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "fmla v3.4s, v27.4s, v17.4s\n"
+ "ldr s27, [x14, x15]\n"
+ "fmla v7.4s, v30.4s, v15.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v6.4s, v30.4s, v16.4s\n"
+ "ldr s26, [x17, x24]\n"
+ "fmla v2.4s, v31.4s, v11.4s\n"
+ "ldr s20, [x26, x18]\n"
+ "fmla v5.4s, v24.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v4.4s, v24.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v2.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v1.4s, v24.4s, v14.4s\n"
+ "ldr s18, [x21, x15]\n"
+ "fmla v8.4s, v29.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "fmla v5.4s, v29.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v7.4s, v29.4s, v10.4s\n"
+ "add x17, x17, #4\n"
+ "fmla v2.4s, v29.4s, v15.4s\n"
+ "prfm pldl1keep, [x17, #64]\n"
+ "fmla v4.4s, v29.4s, v13.4s\n"
+ "prfm pldl1keep, [x17, x28]\n"
+ "fmla v6.4s, v29.4s, v11.4s\n"
+ "subs x19, x19, #1\n"
+ "fmla v1.4s, v29.4s, v16.4s\n"
+ "fmla v3.4s, v29.4s, v14.4s\n"
+ "fmla v0.4s, v29.4s, v17.4s\n"
+ "ldr s22, [x14, x24]\n"
+ "fmla v7.4s, v21.4s, v12.4s\n"
+ "ldr s23, [x26, x15]\n"
+ "fmla v4.4s, v21.4s, v15.4s\n"
+ "add x14, x14, #4\n"
+ "fmla v6.4s, v21.4s, v13.4s\n"
+ "prfm pldl1keep, [x14, #64]\n"
+ "fmla v3.4s, v21.4s, v16.4s\n"
+ "ldr s24, [x21, x24]\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "prfm pldl1keep, [x14, x28]\n"
+ "fmla v6.4s, v19.4s, v15.4s\n"
+ "ldr s21, [x26, x24]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "ldr s19, [%[wbptr]]\n"
+ "fmla v5.4s, v25.4s, v9.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v2.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, #64]\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v1.4s, v25.4s, v13.4s\n"
+ "fmla v3.4s, v25.4s, v11.4s\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "ldr s17, [%[wbptr], #4]\n"
+ "fmla v7.4s, v27.4s, v9.4s\n"
+ "ldr s25, [%[inptr0]]\n"
+ "fmla v4.4s, v27.4s, v12.4s\n"
+ "fmla v6.4s, v27.4s, v10.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v3.4s, v27.4s, v13.4s\n"
+ "fmla v0.4s, v27.4s, v16.4s\n"
+ "ldr s14, [%[wbptr], #16]\n"
+ "fmla v6.4s, v26.4s, v12.4s\n"
+ "ldr s27, [x17]\n"
+ "fmla v3.4s, v26.4s, v15.4s\n"
+ "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v2.4s, v20.4s, v9.4s\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v0.4s, v20.4s, v11.4s\n"
+ "ldr s16, [%[wbptr], #8]\n"
+ "fmla v4.4s, v18.4s, v9.4s\n"
+ "ldr s20, [x14]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v18.4s, v13.4s\n"
+ "ldr s11, [%[wbptr], #28]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "movi v30.16b, #0\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v0.4s, v22.4s, v15.4s\n"
+ "ldr s13, [%[wbptr], #20]\n"
+ "fmov v29.4s, #6.0\n"
+ "fmax v8.4s, v8.4s, v30.4s\n"
+ "fmla v3.4s, v24.4s, v9.4s\n"
+ "fmax v7.4s, v7.4s, v30.4s\n"
+ "fmla v0.4s, v23.4s, v10.4s\n"
+ "ldr s15, [%[wbptr], #12]\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "ldr s22, [x17, %[input_col_stride1]]\n"
+ "fmin v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v30.4s\n"
+ "str s8, [%[outptr0]]\n"
+ "fmla v0.4s, v24.4s, v12.4s\n"
+ "str s7, [%[outptr0], %[output_col_stride1]]\n"
+ "fmin v6.4s, v6.4s, v29.4s\n"
+ "fmax v5.4s, v5.4s, v30.4s\n"
+ "ldr s10, [%[wbptr], #32]\n"
+ "str s6, [%[outptr0], x27]\n"
+ "fmla v0.4s, v21.4s, v9.4s\n"
+ "fmin v5.4s, v5.4s, v29.4s\n"
+ "ldr s12, [%[wbptr], #24]\n"
+ "fmax v4.4s, v4.4s, v30.4s\n"
+ "ldr s28, [%[inptr0], x18]\n"
+ "str s5, [x25]\n"
+ "fmax v3.4s, v3.4s, v30.4s\n"
+ "fmin v4.4s, v4.4s, v29.4s\n"
+ "ldr s9, [%[wbptr], #36]\n"
+ "fmin v3.4s, v3.4s, v29.4s\n"
+ "ldr s23, [x21]\n"
+ "str s4, [x25, %[output_col_stride1]]\n"
+ "fmax v2.4s, v2.4s, v30.4s\n"
+ "str s3, [x25, x27]\n"
+ "fmax v1.4s, v1.4s, v30.4s\n"
+ "fmin v2.4s, v2.4s, v29.4s\n"
+ "ldr s18, [x14, %[input_col_stride1]]\n"
+ "fmin v1.4s, v1.4s, v29.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "str s2, [x13]\n"
+ "fmax v0.4s, v0.4s, v30.4s\n"
+ "str s1, [x13, %[output_col_stride1]]\n"
+ "mov v8.16b, v19.16b\n"
+ "fmin v0.4s, v0.4s, v29.4s\n"
+ "add x25, x25, #4\n"
+ "mov v5.16b, v19.16b\n"
+ "mov v7.16b, v19.16b\n"
+ "str s0, [x13, x27]\n"
+ "mov v2.16b, v19.16b\n"
+ "mov v4.16b, v19.16b\n"
+ "add x13, x13, #4\n"
+ "mov v6.16b, v19.16b\n"
+ "mov v1.16b, v19.16b\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v0.16b, v19.16b\n"
+ "fmla v8.4s, v25.4s, v17.4s\n"
+ "fmla v8.4s, v27.4s, v14.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v5.4s, v27.4s, v17.4s\n"
+ "ldr s27, [x17, x18]\n"
+ "fmla v8.4s, v26.4s, v16.4s\n"
+ "ldr s30, [%[inptr0], x15]\n"
+ "fmla v7.4s, v26.4s, v17.4s\n"
+ "ldr s31, [x26]\n"
+ "fmla v5.4s, v20.4s, v14.4s\n"
+ "ldr s24, [x21, %[input_col_stride1]]\n"
+ "fmla v8.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x17, x22]\n"
+ "fmla v2.4s, v20.4s, v17.4s\n"
+ "ldr s29, [x14, x18]\n"
+ "fmla v5.4s, v22.4s, v16.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v8.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v7.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x21, x28]\n"
+ "fmla v4.4s, v22.4s, v17.4s\n"
+ "ldr s21, [x17, x15]\n"
+ "fmla v8.4s, v28.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x22]\n"
+ "fmla v7.4s, v28.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x16]\n"
+ "fmla v6.4s, v28.4s, v17.4s\n"
+ "ldr s19, [%[inptr0], x24]\n"
+ "fmla v5.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [%[inptr0], x23]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "ldr s28, [x26, %[input_col_stride1]]\n"
+ "fmla v8.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x26, x28]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "prfm pldl1keep, [x21, x22]\n"
+ "fmla v7.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x14, x16]\n"
+ "fmla v2.4s, v18.4s, v16.4s\n"
+ "prfm pldl1keep, [x17, x23]\n"
+ "fmla v4.4s, v18.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x22]\n"
+ "fmla v1.4s, v18.4s, v17.4s\n"
+ "ldr s25, [x21, x18]\n"
+ "fmla v8.4s, v27.4s, v12.4s\n"
+ "prfm pldl1keep, [x21, x16]\n"
+ "fmla v5.4s, v27.4s, v15.4s\n"
+ "prfm pldl1keep, [x14, x23]\n"
+ "fmla v7.4s, v27.4s, v13.4s\n"
+ "prfm pldl1keep, [x26, x16]\n"
+ "fmla v4.4s, v27.4s, v16.4s\n"
+ "prfm pldl1keep, [x21, x23]\n"
+ "fmla v6.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x23]\n"
+ "fmla v3.4s, v27.4s, v17.4s\n"
+ "ldr s27, [x14, x15]\n"
+ "fmla v7.4s, v30.4s, v15.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v6.4s, v30.4s, v16.4s\n"
+ "ldr s26, [x17, x24]\n"
+ "fmla v2.4s, v31.4s, v11.4s\n"
+ "ldr s20, [x26, x18]\n"
+ "fmla v5.4s, v24.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v4.4s, v24.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v2.4s, v24.4s, v13.4s\n"
+ "add x17, x17, #4\n"
+ "fmla v1.4s, v24.4s, v14.4s\n"
+ "ldr s18, [x21, x15]\n"
+ "fmla v8.4s, v29.4s, v9.4s\n"
+ "fmla v5.4s, v29.4s, v12.4s\n"
+ "fmla v7.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v15.4s\n"
+ "fmla v4.4s, v29.4s, v13.4s\n"
+ "fmla v6.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v16.4s\n"
+ "fmla v3.4s, v29.4s, v14.4s\n"
+ "fmla v0.4s, v29.4s, v17.4s\n"
+ "ldr s22, [x14, x24]\n"
+ "fmla v7.4s, v21.4s, v12.4s\n"
+ "ldr s23, [x26, x15]\n"
+ "fmla v4.4s, v21.4s, v15.4s\n"
+ "add x14, x14, #4\n"
+ "fmla v6.4s, v21.4s, v13.4s\n"
+ "fmla v3.4s, v21.4s, v16.4s\n"
+ "fmla v2.4s, v28.4s, v10.4s\n"
+ "ldr s24, [x21, x24]\n"
+ "fmla v1.4s, v28.4s, v11.4s\n"
+ "ldr s21, [x26, x24]\n"
+ "fmla v6.4s, v19.4s, v15.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v5.4s, v25.4s, v9.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v2.4s, v25.4s, v12.4s\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v13.4s\n"
+ "fmla v3.4s, v25.4s, v11.4s\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "fmla v7.4s, v27.4s, v9.4s\n"
+ "fmla v4.4s, v27.4s, v12.4s\n"
+ "fmla v6.4s, v27.4s, v10.4s\n"
+ "fmla v1.4s, v27.4s, v15.4s\n"
+ "fmla v3.4s, v27.4s, v13.4s\n"
+ "fmla v0.4s, v27.4s, v16.4s\n"
+ "fmla v2.4s, v20.4s, v9.4s\n"
+ "fmla v6.4s, v26.4s, v12.4s\n"
+ "fmla v4.4s, v18.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v15.4s\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v0.4s, v20.4s, v11.4s\n"
+ "movi v30.16b, #0\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "fmov v29.4s, #6.0\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v18.4s, v13.4s\n"
+ "fmax v8.4s, v8.4s, v30.4s\n"
+ "fmax v7.4s, v7.4s, v30.4s\n"
+ "fmax v6.4s, v6.4s, v30.4s\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v0.4s, v22.4s, v15.4s\n"
+ "fmin v8.4s, v8.4s, v29.4s\n"
+ "fmin v7.4s, v7.4s, v29.4s\n"
+ "fmin v6.4s, v6.4s, v29.4s\n"
+ "str s8, [%[outptr0]]\n"
+ "fmla v3.4s, v24.4s, v9.4s\n"
+ "str s7, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v0.4s, v23.4s, v10.4s\n"
+ "str s6, [%[outptr0], x27]\n"
+ "fmax v5.4s, v5.4s, v30.4s\n"
+ "fmax v4.4s, v4.4s, v30.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v0.4s, v24.4s, v12.4s\n"
+ "fmin v5.4s, v5.4s, v29.4s\n"
+ "fmin v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v30.4s\n"
+ "str s5, [x25]\n"
+ "fmax v2.4s, v2.4s, v30.4s\n"
+ "str s4, [x25, %[output_col_stride1]]\n"
+ "fmla v0.4s, v21.4s, v9.4s\n"
+ "fmin v3.4s, v3.4s, v29.4s\n"
+ "fmin v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v30.4s\n"
+ "str s3, [x25, x27]\n"
+ "str s2, [x13]\n"
+ "fmin v1.4s, v1.4s, v29.4s\n"
+ "fmax v0.4s, v0.4s, v30.4s\n"
+ "add x25, x25, #4\n"
+ "str s1, [x13, %[output_col_stride1]]\n"
+ "fmin v0.4s, v0.4s, v29.4s\n"
+ "str s0, [x13, x27]\n"
+ "add x13, x13, #4\n"
+ "7:\n"
+ : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+ : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
}
#endif // __aarch64__
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
+template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp
new file mode 100644
index 0000000..8348692
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp16_fp16.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index c7113d0..adc6969 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,599 +25,745 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
#ifdef __aarch64__
-
template <>
template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
- const int n_channels,
- const float* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void* weight_bias_ptr,
+ const float* input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float* output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
)
{
- // Copy pointers
- const float *uptr0 = inptr;
- const float *wptr0 = weights;
- float *vptr0 = outptr;
-
- int channels_remaining = n_channels;
- if (channels_remaining >= 4)
- {
- // Process blocks of 4 channels at a time
- int n_iters = channels_remaining / 4 - 1;
- channels_remaining %= 4;
-
- asm volatile(
- // Prepare aliases
- "qW13 .req q0\n" "vW13 .req v0\n"
- "qU15 .req q1\n" "qU73 .req q1\n" "qU45 .req q1\n" "qU14 .req q1\n"
- "vU15 .req v1\n" "vU73 .req v1\n" "vU45 .req v1\n" "vU14 .req v1\n"
- "qU62 .req q2\n" "qV12 .req q2\n" "vU62 .req v2\n" "vV12 .req v2\n"
- "qU51 .req q3\n" "qU43 .req q3\n" "qU55 .req q3\n"
- "vU51 .req v3\n" "vU43 .req v3\n" "vU55 .req v3\n"
- "qU77 .req q4\n" "qV13 .req q4\n" "qV31 .req q4\n" "qU44 .req q4\n"
- "vU77 .req v4\n" "vV13 .req v4\n" "vV31 .req v4\n" "vU44 .req v4\n"
- "qV33 .req q5\n" "qU46 .req q5\n" "qU11 .req q5\n" "qU37 .req q5\n"
- "vV33 .req v5\n" "vU46 .req v5\n" "vU11 .req v5\n" "vU37 .req v5\n"
- "qU56 .req q6\n" "qU25 .req q6\n" "qU32 .req q6\n"
- "vU56 .req v6\n" "vU25 .req v6\n" "vU32 .req v6\n"
- "qU72 .req q7\n" "qV22 .req q7\n" "vU72 .req v7\n" "vV22 .req v7\n"
- "qU67 .req q8\n" "qU61 .req q8\n" "qU13 .req q8\n"
- "vU67 .req v8\n" "vU61 .req v8\n" "vU13 .req v8\n"
- "qU74 .req q9\n" "qU34 .req q9\n" "qU17 .req q9\n" "qU66 .req q9\n"
- "vU74 .req v9\n" "vU34 .req v9\n" "vU17 .req v9\n" "vU66 .req v9\n"
- "qU33 .req q10\n" "qU57 .req q10\n" "qU21 .req q10\n"
- "vU33 .req v10\n" "vU57 .req v10\n" "vU21 .req v10\n" "qW23 .req q11\n"
- "vW23 .req v11\n" "qU42 .req q12\n" "qV23 .req q12\n" "qU23 .req q12\n"
- "vU42 .req v12\n" "vV23 .req v12\n" "vU23 .req v12\n"
- "qW33 .req q13\n" "vW33 .req v13\n"
- "qU76 .req q14\n" "qU47 .req q14\n" "qU64 .req q14\n" "qU41 .req q14\n"
- "vU76 .req v14\n" "vU47 .req v14\n" "vU64 .req v14\n" "vU41 .req v14\n"
- "qU52 .req q15\n" "qU54 .req q15\n" "qU75 .req q15\n" "qU26 .req q15\n"
- "vU52 .req v15\n" "vU54 .req v15\n" "vU75 .req v15\n" "vU26 .req v15\n"
- "qU53 .req q16\n" "qU27 .req q16\n" "vU53 .req v16\n" "vU27 .req v16\n"
- "qV21 .req q17\n" "qU65 .req q17\n" "vV21 .req v17\n" "vU65 .req v17\n"
- "qU31 .req q18\n" "qU24 .req q18\n" "qU36 .req q18\n"
- "vU31 .req v18\n" "vU24 .req v18\n" "vU36 .req v18\n" "qU22 .req q19\n"
- "vU22 .req v19\n" "qU35 .req q20\n" "qU63 .req q20\n"
- "vU35 .req v20\n" "vU63 .req v20\n" "qW12 .req q21\n"
- "vW12 .req v21\n" "qV32 .req q22\n" "qU16 .req q22\n"
- "vV32 .req v22\n" "vU16 .req v22\n" "qW11 .req q23\n" "vW11 .req v23\n"
- "qU12 .req q24\n" "vU12 .req v24\n" "qW31 .req q25\n" "vW31 .req v25\n"
- "qW22 .req q26\n" "vW22 .req v26\n" "qU71 .req q27\n" "vU71 .req v27\n"
- "qV11 .req q28\n" "vV11 .req v28\n" "qW21 .req q29\n" "vW21 .req v29\n"
- "qW32 .req q30\n" "vW32 .req v30\n"
-
- "uptr1 .req x0\n"
- "uptr2 .req x1\n"
- "uptr3 .req x2\n"
- "uptr4 .req x3\n"
- "uptr5 .req x4\n"
- "uptr6 .req x5\n"
- "u_col_stride1 .req %x[u_col_stride]\n"
- "u_col_stride2 .req x6\n"
- "u_col_stride3 .req x7\n"
- "u_col_stride4 .req x8\n"
- "u_col_stride5 .req x9\n"
- "u_col_stride6 .req x10\n"
- "wptr1 .req x11\n"
- "wptr2 .req x12\n"
- "w_col_stride1 .req %x[w_col_stride]\n"
- "w_col_stride2 .req x13\n"
- "vptr1 .req x14\n"
- "vptr2 .req x15\n"
- "v_col_stride1 .req %x[v_col_stride]\n"
- "v_col_stride2 .req x16\n"
-
- // Prepare strides and pointers
- "add uptr1, %x[uptr0], %x[u_row_stride]\n"
- "add uptr2, uptr1 , %x[u_row_stride]\n"
- "add uptr3, uptr2 , %x[u_row_stride]\n"
- "add uptr4, uptr3 , %x[u_row_stride]\n"
- "add uptr5, uptr4 , %x[u_row_stride]\n"
- "add uptr6, uptr5 , %x[u_row_stride]\n"
- "add u_col_stride2, u_col_stride1, u_col_stride1\n"
- "add u_col_stride3, u_col_stride2, u_col_stride1\n"
- "add u_col_stride4, u_col_stride3, u_col_stride1\n"
- "add u_col_stride5, u_col_stride4, u_col_stride1\n"
- "add u_col_stride6, u_col_stride5, u_col_stride1\n"
-
- "add wptr1, %x[wptr0], %x[w_row_stride]\n"
- "add wptr2, wptr1 , %x[w_row_stride]\n"
- "add w_col_stride2, w_col_stride1, w_col_stride1\n"
-
- "add vptr1, %x[vptr0], %x[v_row_stride]\n"
- "add vptr2, vptr1 , %x[v_row_stride]\n"
- "add v_col_stride2, v_col_stride1, v_col_stride1\n"
-
- // Prepare for first iteration
- "ldr qW13, [%x[wptr0], w_col_stride2]\n"
- "ldr qW23, [wptr1, w_col_stride2]\n"
- "ldr qW33, [wptr2, w_col_stride2]\n"
- "ldr qW12, [%x[wptr0], w_col_stride1]\n"
- "ldr qW22, [wptr1, w_col_stride1]\n"
- "ldr qW32, [wptr2, w_col_stride1]\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "ldr qW21, [wptr1], #0x10\n"
- "ldr qU17, [%x[uptr0], u_col_stride6]\n"
- "ldr qU15, [%x[uptr0], u_col_stride4]\n"
- "ldr qU16, [%x[uptr0], u_col_stride5]\n"
- "ldr qU37, [uptr2, u_col_stride6]\n"
- "ldr qU35, [uptr2, u_col_stride4]\n"
- "ldr qU36, [uptr2, u_col_stride5]\n"
- "ldr qU27, [uptr1, u_col_stride6]\n"
- "ldr qU25, [uptr1, u_col_stride4]\n"
- "fmul vV13.4s, vU17.4s, vW13.4s\n"
- "fmul vV12.4s, vU15.4s, vW13.4s\n"
- "fmla vV13.4s, vU15.4s, vW11.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV13.4s, vU16.4s, vW12.4s\n"
- "ldr qU26, [uptr1, u_col_stride5]\n"
- "fmla vV13.4s, vU37.4s, vW33.4s\n"
- "ldr qU47, [uptr3, u_col_stride6]\n"
- "fmul vV23.4s, vU37.4s, vW13.4s\n"
- "ldr qU45, [uptr3, u_col_stride4]\n"
- "fmla vV12.4s, vU35.4s, vW33.4s\n"
- "ldr qU46, [uptr3, u_col_stride5]\n"
- "fmla vV13.4s, vU35.4s, vW31.4s\n"
- "ldr qU67, [uptr5, u_col_stride6]\n"
- "fmul vV22.4s, vU35.4s, vW13.4s\n"
- "cbz %x[n_iters], 2f\n" // Jump to tail if no iterations
-
- "1:" // Loop body
- "fmla vV23.4s, vU35.4s, vW11.4s\n"
- "ldr qU65, [uptr5, u_col_stride4]\n"
- "fmla vV13.4s, vU36.4s, vW32.4s\n"
- "fmla vV23.4s, vU36.4s, vW12.4s\n"
- "ldr qU66, [uptr5, u_col_stride5]\n"
- "fmla vV13.4s, vU27.4s, vW23.4s\n"
- "ldr qU57, [uptr4, u_col_stride6]\n"
- "fmla vV12.4s, vU25.4s, vW23.4s\n"
- "ldr qU55, [uptr4, u_col_stride4]\n"
- "fmla vV13.4s, vU25.4s, vW21.4s\n"
- "ldr qU56, [uptr4, u_col_stride5]\n"
- "fmla vV13.4s, vU26.4s, vW22.4s\n"
- "str qV13, [%x[vptr0], v_col_stride2]\n"
- "fmla vV23.4s, vU47.4s, vW23.4s\n"
- "ldr qU77, [uptr6, u_col_stride6]\n"
- "fmla vV22.4s, vU45.4s, vW23.4s\n"
- "fmla vV23.4s, vU45.4s, vW21.4s\n"
- "ldr qU75, [uptr6, u_col_stride4]\n"
- "fmla vV23.4s, vU46.4s, vW22.4s\n"
- "ldr qU76, [uptr6, u_col_stride5]\n"
- "fmul vV33.4s, vU67.4s, vW23.4s\n"
- "ldr qU14, [%x[uptr0], u_col_stride3]\n"
- "fmul vV32.4s, vU65.4s, vW23.4s\n"
- "fmla vV33.4s, vU65.4s, vW21.4s\n"
- "ldr qU13, [%x[uptr0], u_col_stride2]\n"
- "fmla vV33.4s, vU66.4s, vW22.4s\n"
- "ldr qU34, [uptr2, u_col_stride3]\n"
- "fmla vV23.4s, vU57.4s, vW33.4s\n"
- "fmla vV33.4s, vU57.4s, vW13.4s\n"
- "ldr qU33, [uptr2, u_col_stride2]\n"
- "fmla vV22.4s, vU55.4s, vW33.4s\n"
- "fmla vV23.4s, vU55.4s, vW31.4s\n"
- "fmla vV32.4s, vU55.4s, vW13.4s\n"
- "fmla vV33.4s, vU55.4s, vW11.4s\n"
- "ldr qU24, [uptr1, u_col_stride3]\n"
- "fmla vV23.4s, vU56.4s, vW32.4s\n"
- "str qV23, [vptr1, v_col_stride2]\n"
- "fmla vV33.4s, vU56.4s, vW12.4s\n"
- "ldr qU23, [uptr1, u_col_stride2]\n"
- "fmla vV33.4s, vU77.4s, vW33.4s\n"
- "ldr qU44, [uptr3, u_col_stride3]\n"
- "fmla vV32.4s, vU75.4s, vW33.4s\n"
- "fmla vV33.4s, vU75.4s, vW31.4s\n"
- "ldr qU43, [uptr3, u_col_stride2]\n"
- "fmla vV33.4s, vU76.4s, vW32.4s\n"
- "str qV33, [vptr2, v_col_stride2]\n"
- "ldr qU64, [uptr5, u_col_stride3]\n"
- "fmla vV12.4s, vU14.4s, vW12.4s\n"
- "ldr qU63, [uptr5, u_col_stride2]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "fmla vV12.4s, vU13.4s, vW11.4s\n"
- "ldr qU54, [uptr4, u_col_stride3]\n"
- "fmla vV12.4s, vU34.4s, vW32.4s\n"
- "fmla vV22.4s, vU34.4s, vW12.4s\n"
- "ldr qU53, [uptr4, u_col_stride2]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "ldr qU74, [uptr6, u_col_stride3]\n"
- "fmla vV12.4s, vU33.4s, vW31.4s\n"
- "ldr qU73, [uptr6, u_col_stride2]\n"
- "fmul vV21.4s, vU33.4s, vW13.4s\n"
- "ldr qU12, [%x[uptr0], u_col_stride1]\n"
- "fmla vV22.4s, vU33.4s, vW11.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV12.4s, vU24.4s, vW22.4s\n"
- "ldr qU32, [uptr2, u_col_stride1]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV12.4s, vU23.4s, vW21.4s\n"
- "str qV12, [%x[vptr0], v_col_stride1]\n"
- "fmla vV22.4s, vU44.4s, vW22.4s\n"
- "ldr qU22, [uptr1, u_col_stride1]\n"
- "fmla vV21.4s, vU43.4s, vW23.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV22.4s, vU43.4s, vW21.4s\n"
- "ldr qU42, [uptr3, u_col_stride1]\n"
- "fmla vV32.4s, vU64.4s, vW22.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmul vV31.4s, vU63.4s, vW23.4s\n"
- "ldr qW23, [wptr1, w_col_stride2]\n"
- "fmla vV32.4s, vU63.4s, vW21.4s\n"
- "ldr qU62, [uptr5, u_col_stride1]\n"
- "fmla vV22.4s, vU54.4s, vW32.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV32.4s, vU54.4s, vW12.4s\n"
- "ldr qU52, [uptr4, u_col_stride1]\n"
- "fmla vV21.4s, vU53.4s, vW33.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV22.4s, vU53.4s, vW31.4s\n"
- "str qV22, [vptr1, v_col_stride1]\n"
- "fmla vV31.4s, vU53.4s, vW13.4s\n"
- "ldr qW13, [%x[wptr0], w_col_stride2]\n"
- "fmla vV32.4s, vU53.4s, vW11.4s\n"
- "ldr qU72, [uptr6, u_col_stride1]\n"
- "fmla vV32.4s, vU74.4s, vW32.4s\n"
- "ldr qU71, [uptr6], #0x10\n"
- "fmla vV31.4s, vU73.4s, vW33.4s\n"
- "ldr qW33, [wptr2, w_col_stride2]\n"
- "fmla vV32.4s, vU73.4s, vW31.4s\n"
- "str qV32, [vptr2, v_col_stride1]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "ldr qU17, [%x[uptr0], u_col_stride6]\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "ldr qU15, [%x[uptr0], u_col_stride4]\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "ldr qU16, [%x[uptr0], u_col_stride5]\n"
- "fmla vV21.4s, vU32.4s, vW12.4s\n"
- "ldr qU37, [uptr2, u_col_stride6]\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "ldr qU35, [uptr2, u_col_stride4]\n"
- "fmla vV21.4s, vU31.4s, vW11.4s\n"
- "ldr qU36, [uptr2, u_col_stride5]\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "ldr qU27, [uptr1, u_col_stride6]\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW22.4s\n"
- "ldr qU25, [uptr1, u_col_stride4]\n"
- "fmla vV21.4s, vU41.4s, vW21.4s\n"
- "fmla vV31.4s, vU62.4s, vW22.4s\n"
- "ldr qW22, [wptr1, w_col_stride1]\n"
- "fmla vV31.4s, vU61.4s, vW21.4s\n"
- "ldr qW21, [wptr1], #0x10\n"
- "fmla vV21.4s, vU52.4s, vW32.4s\n"
- "fmla vV31.4s, vU52.4s, vW12.4s\n"
- "ldr qW12, [%x[wptr0], w_col_stride1]\n"
- "fmla vV21.4s, vU51.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU51.4s, vW11.4s\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "fmla vV31.4s, vU72.4s, vW32.4s\n"
- "ldr qW32, [wptr2, w_col_stride1]\n"
- "fmla vV31.4s, vU71.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
- "fmul vV13.4s, vU17.4s, vW13.4s\n"
- "fmul vV12.4s, vU15.4s, vW13.4s\n"
- "subs %x[n_iters], %x[n_iters], #1\n"
- "fmla vV13.4s, vU15.4s, vW11.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV13.4s, vU16.4s, vW12.4s\n"
- "ldr qU26, [uptr1, u_col_stride5]\n"
- "fmla vV13.4s, vU37.4s, vW33.4s\n"
- "ldr qU47, [uptr3, u_col_stride6]\n"
- "fmul vV23.4s, vU37.4s, vW13.4s\n"
- "ldr qU45, [uptr3, u_col_stride4]\n"
- "fmla vV12.4s, vU35.4s, vW33.4s\n"
- "ldr qU46, [uptr3, u_col_stride5]\n"
- "fmla vV13.4s, vU35.4s, vW31.4s\n"
- "ldr qU67, [uptr5, u_col_stride6]\n"
- "fmul vV22.4s, vU35.4s, vW13.4s\n"
- "bne 1b\n"
-
- "2:" // Tail iteration
- "fmla vV23.4s, vU35.4s, vW11.4s\n"
- "ldr qU65, [uptr5, u_col_stride4]\n"
- "fmla vV13.4s, vU36.4s, vW32.4s\n"
- "fmla vV23.4s, vU36.4s, vW12.4s\n"
- "ldr qU66, [uptr5, u_col_stride5]\n"
- "fmla vV13.4s, vU27.4s, vW23.4s\n"
- "ldr qU57, [uptr4, u_col_stride6]\n"
- "fmla vV12.4s, vU25.4s, vW23.4s\n"
- "ldr qU55, [uptr4, u_col_stride4]\n"
- "fmla vV13.4s, vU25.4s, vW21.4s\n"
- "ldr qU56, [uptr4, u_col_stride5]\n"
- "fmla vV13.4s, vU26.4s, vW22.4s\n"
- "str qV13, [%x[vptr0], v_col_stride2]\n"
- "fmla vV23.4s, vU47.4s, vW23.4s\n"
- "ldr qU77, [uptr6, u_col_stride6]\n"
- "fmla vV22.4s, vU45.4s, vW23.4s\n"
- "fmla vV23.4s, vU45.4s, vW21.4s\n"
- "ldr qU75, [uptr6, u_col_stride4]\n"
- "fmla vV23.4s, vU46.4s, vW22.4s\n"
- "ldr qU76, [uptr6, u_col_stride5]\n"
- "fmul vV33.4s, vU67.4s, vW23.4s\n"
- "ldr qU14, [%x[uptr0], u_col_stride3]\n"
- "fmul vV32.4s, vU65.4s, vW23.4s\n"
- "fmla vV33.4s, vU65.4s, vW21.4s\n"
- "ldr qU13, [%x[uptr0], u_col_stride2]\n"
- "fmla vV33.4s, vU66.4s, vW22.4s\n"
- "ldr qU34, [uptr2, u_col_stride3]\n"
- "fmla vV23.4s, vU57.4s, vW33.4s\n"
- "fmla vV33.4s, vU57.4s, vW13.4s\n"
- "ldr qU33, [uptr2, u_col_stride2]\n"
- "fmla vV22.4s, vU55.4s, vW33.4s\n"
- "fmla vV23.4s, vU55.4s, vW31.4s\n"
- "fmla vV32.4s, vU55.4s, vW13.4s\n"
- "fmla vV33.4s, vU55.4s, vW11.4s\n"
- "ldr qU24, [uptr1, u_col_stride3]\n"
- "fmla vV23.4s, vU56.4s, vW32.4s\n"
- "str qV23, [vptr1, v_col_stride2]\n"
- "fmla vV33.4s, vU56.4s, vW12.4s\n"
- "ldr qU23, [uptr1, u_col_stride2]\n"
- "fmla vV33.4s, vU77.4s, vW33.4s\n"
- "ldr qU44, [uptr3, u_col_stride3]\n"
- "fmla vV32.4s, vU75.4s, vW33.4s\n"
- "fmla vV33.4s, vU75.4s, vW31.4s\n"
- "ldr qU43, [uptr3, u_col_stride2]\n"
- "fmla vV33.4s, vU76.4s, vW32.4s\n"
- "str qV33, [vptr2, v_col_stride2]\n"
- "ldr qU64, [uptr5, u_col_stride3]\n"
- "fmla vV12.4s, vU14.4s, vW12.4s\n"
- "ldr qU63, [uptr5, u_col_stride2]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "fmla vV12.4s, vU13.4s, vW11.4s\n"
- "ldr qU54, [uptr4, u_col_stride3]\n"
- "fmla vV12.4s, vU34.4s, vW32.4s\n"
- "fmla vV22.4s, vU34.4s, vW12.4s\n"
- "ldr qU53, [uptr4, u_col_stride2]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "ldr qU74, [uptr6, u_col_stride3]\n"
- "fmla vV12.4s, vU33.4s, vW31.4s\n"
- "ldr qU73, [uptr6, u_col_stride2]\n"
- "fmul vV21.4s, vU33.4s, vW13.4s\n"
- "ldr qU12, [%x[uptr0], u_col_stride1]\n"
- "fmla vV22.4s, vU33.4s, vW11.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV12.4s, vU24.4s, vW22.4s\n"
- "ldr qU32, [uptr2, u_col_stride1]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV12.4s, vU23.4s, vW21.4s\n"
- "str qV12, [%x[vptr0], v_col_stride1]\n"
- "fmla vV22.4s, vU44.4s, vW22.4s\n"
- "ldr qU22, [uptr1, u_col_stride1]\n"
- "fmla vV21.4s, vU43.4s, vW23.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV22.4s, vU43.4s, vW21.4s\n"
- "ldr qU42, [uptr3, u_col_stride1]\n"
- "fmla vV32.4s, vU64.4s, vW22.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmul vV31.4s, vU63.4s, vW23.4s\n"
- "fmla vV32.4s, vU63.4s, vW21.4s\n"
- "ldr qU62, [uptr5, u_col_stride1]\n"
- "fmla vV22.4s, vU54.4s, vW32.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV32.4s, vU54.4s, vW12.4s\n"
- "ldr qU52, [uptr4, u_col_stride1]\n"
- "fmla vV21.4s, vU53.4s, vW33.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV22.4s, vU53.4s, vW31.4s\n"
- "str qV22, [vptr1, v_col_stride1]\n"
- "fmla vV31.4s, vU53.4s, vW13.4s\n"
- "fmla vV32.4s, vU53.4s, vW11.4s\n"
- "ldr qU72, [uptr6, u_col_stride1]\n"
- "fmla vV32.4s, vU74.4s, vW32.4s\n"
- "ldr qU71, [uptr6], #0x10\n"
- "fmla vV31.4s, vU73.4s, vW33.4s\n"
- "fmla vV32.4s, vU73.4s, vW31.4s\n"
- "str qV32, [vptr2, v_col_stride1]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "fmla vV21.4s, vU32.4s, vW12.4s\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "fmla vV21.4s, vU31.4s, vW11.4s\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW22.4s\n"
- "fmla vV21.4s, vU41.4s, vW21.4s\n"
- "fmla vV31.4s, vU62.4s, vW22.4s\n"
- "fmla vV31.4s, vU61.4s, vW21.4s\n"
- "fmla vV21.4s, vU52.4s, vW32.4s\n"
- "fmla vV31.4s, vU52.4s, vW12.4s\n"
- "fmla vV21.4s, vU51.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU51.4s, vW11.4s\n"
- "fmla vV31.4s, vU72.4s, vW32.4s\n"
- "fmla vV31.4s, vU71.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
-
- // Clear aliases
- ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
- ".unreq uptr5\n" ".unreq uptr6\n"
- ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" ".unreq u_col_stride3\n"
- ".unreq u_col_stride4\n" ".unreq u_col_stride5\n" ".unreq u_col_stride6\n"
- ".unreq wptr1\n" ".unreq wptr2\n"
- ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
- ".unreq vptr1\n" ".unreq vptr2\n"
- ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
- ".unreq qU15\n" ".unreq qU73\n" ".unreq qU45\n" ".unreq qU14\n"
- ".unreq qW13\n" ".unreq qU62\n" ".unreq qV12\n"
- ".unreq qU51\n" ".unreq qU43\n" ".unreq qU55\n"
- ".unreq qU77\n" ".unreq qV13\n" ".unreq qV31\n" ".unreq qU44\n"
- ".unreq qV33\n" ".unreq qU46\n" ".unreq qU11\n" ".unreq qU37\n"
- ".unreq qU56\n" ".unreq qU25\n" ".unreq qU32\n"
- ".unreq qU72\n" ".unreq qV22\n"
- ".unreq qU67\n" ".unreq qU61\n" ".unreq qU13\n" ".unreq qW33\n"
- ".unreq qU74\n" ".unreq qU34\n" ".unreq qU17\n" ".unreq qU66\n"
- ".unreq qU33\n" ".unreq qU57\n" ".unreq qU21\n"
- ".unreq qW23\n" ".unreq qU42\n" ".unreq qV23\n" ".unreq qU23\n"
- ".unreq qU76\n" ".unreq qU47\n" ".unreq qU64\n" ".unreq qU41\n"
- ".unreq qU52\n" ".unreq qU54\n" ".unreq qU75\n" ".unreq qU26\n"
- ".unreq qU53\n" ".unreq qU27\n"
- ".unreq qV21\n" ".unreq qU65\n"
- ".unreq qU31\n" ".unreq qU24\n" ".unreq qU36\n" ".unreq qU22\n"
- ".unreq qU35\n" ".unreq qU63\n" ".unreq qW12\n"
- ".unreq qV32\n" ".unreq qU16\n" ".unreq qW11\n" ".unreq qU12\n"
- ".unreq qW31\n" ".unreq qW22\n" ".unreq qU71\n" ".unreq qV11\n"
- ".unreq qW21\n" ".unreq qW32\n" ".unreq vW13\n"
- ".unreq vU15\n" ".unreq vU73\n" ".unreq vU45\n" ".unreq vU14\n"
- ".unreq vU62\n" ".unreq vV12\n"
- ".unreq vU51\n" ".unreq vU43\n" ".unreq vU55\n"
- ".unreq vU77\n" ".unreq vV13\n" ".unreq vV31\n" ".unreq vU44\n"
- ".unreq vV33\n" ".unreq vU46\n" ".unreq vU11\n" ".unreq vU37\n"
- ".unreq vU56\n" ".unreq vU25\n" ".unreq vU32\n"
- ".unreq vU72\n" ".unreq vV22\n" ".unreq vW21\n" ".unreq vW32\n"
- ".unreq vU67\n" ".unreq vU61\n" ".unreq vU13\n"
- ".unreq vU74\n" ".unreq vU34\n" ".unreq vU17\n" ".unreq vU66\n"
- ".unreq vU33\n" ".unreq vU57\n" ".unreq vU21\n" ".unreq vW23\n"
- ".unreq vU42\n" ".unreq vV23\n" ".unreq vU23\n" ".unreq vW33\n"
- ".unreq vU76\n" ".unreq vU47\n" ".unreq vU64\n" ".unreq vU41\n"
- ".unreq vU52\n" ".unreq vU54\n" ".unreq vU75\n" ".unreq vU26\n"
- ".unreq vU53\n" ".unreq vU27\n" ".unreq vV21\n" ".unreq vU65\n"
- ".unreq vU31\n" ".unreq vU24\n" ".unreq vU36\n" ".unreq vU22\n"
- ".unreq vU35\n" ".unreq vU63\n" ".unreq vW12\n"
- ".unreq vV32\n" ".unreq vU16\n" ".unreq vW11\n" ".unreq vU12\n"
- ".unreq vW31\n" ".unreq vW22\n" ".unreq vU71\n" ".unreq vV11\n"
- : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
- [n_iters] "+r" (n_iters)
- : [u_row_stride] "r" (in_row_stride * sizeof(float)),
- [u_col_stride] "r" (in_col_stride * sizeof(float)),
- [w_row_stride] "r" (weight_row_stride * sizeof(float)),
- [w_col_stride] "r" (weight_col_stride * sizeof(float)),
- [v_row_stride] "r" (out_row_stride * sizeof(float)),
- [v_col_stride] "r" (out_col_stride * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
- "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
- "x12", "x13", "x14", "x15", "x16", "cc", "memory"
- );
- }
- if (channels_remaining)
- {
- // Fall back on the unoptimised version to clean up the tail
- ConvImpl::process_tile<false>(
- channels_remaining,
- wptr0, weight_row_stride, weight_col_stride,
- uptr0, in_row_stride, in_col_stride,
- vptr0, out_row_stride, out_col_stride,
- 0, 0, 0, 0, 0, 0
- );
- }
+ __asm __volatile(
+ "add x15, %[inptr0], %[input_row_stride]\n"
+ "add x26, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x21, %[outptr0], %[output_row_stride]\n"
+ "add x16, x15, %[input_row_stride]\n"
+ "add x27, x26, %[input_col_stride1]\n"
+ "add x22, x21, %[output_row_stride]\n"
+ "add x17, x16, %[input_row_stride]\n"
+ "add x28, x27, %[input_col_stride1]\n"
+ "add x23, %[output_col_stride1], %[output_col_stride1]\n"
+ "add x18, x17, %[input_row_stride]\n"
+ "add x13, x28, %[input_col_stride1]\n"
+ "and x24, %[n_channels], #3\n"
+ "add x19, x18, %[input_row_stride]\n"
+ "add x14, x13, %[input_col_stride1]\n"
+ "lsr x25, %[n_channels], #2\n"
+ "add x20, x19, %[input_row_stride]\n"
+ "cbz x25, 4f\n"
+ "1:\n"
+ "ldr q27, [%[wbptr]]\n"
+ "subs x25, x25, #1\n"
+ "mov v17.16b, v27.16b\n"
+ "ldr q6, [%[wbptr], #16]\n"
+ "mov v16.16b, v27.16b\n"
+ "ldr q14, [%[wbptr], #32]\n"
+ "mov v15.16b, v27.16b\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "mov v2.16b, v27.16b\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "mov v4.16b, v27.16b\n"
+ "ldr q11, [%[wbptr], #80]\n"
+ "mov v5.16b, v27.16b\n"
+ "ldr q10, [%[wbptr], #96]\n"
+ "mov v1.16b, v27.16b\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "mov v3.16b, v27.16b\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "mov v0.16b, v27.16b\n"
+ "ldr q7, [%[wbptr], #144]\n"
+ "ldr q29, [%[inptr0]]\n"
+ "ldr q28, [x15]\n"
+ "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q22, [x16]\n"
+ "ldr q20, [x15, %[input_col_stride1]]\n"
+ "ldr q19, [%[inptr0], x26]\n"
+ "ldr q30, [x17]\n"
+ "ldr q18, [x16, %[input_col_stride1]]\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v17.4s, v29.4s, v6.4s\n"
+ "ldr q21, [x15, x26]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "ldr q27, [%[inptr0], x27]\n"
+ "fmla v15.4s, v19.4s, v6.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v28.4s, v12.4s\n"
+ "ldr q25, [x18]\n"
+ "fmla v16.4s, v30.4s, v12.4s\n"
+ "ldr q24, [x17, %[input_col_stride1]]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v17.4s, v26.4s, v14.4s\n"
+ "ldr q23, [x16, x26]\n"
+ "fmla v16.4s, v18.4s, v14.4s\n"
+ "subs x25, x25, #1\n"
+ "fmla v15.4s, v27.4s, v14.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v17.4s, v22.4s, v9.4s\n"
+ "ldr q22, [%[inptr0], x28]\n"
+ "fmla v16.4s, v25.4s, v9.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v23.4s, v9.4s\n"
+ "ldr q30, [x19]\n"
+ "fmla v17.4s, v20.4s, v11.4s\n"
+ "ldr q29, [x18, %[input_col_stride1]]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "ldr q28, [x17, x26]\n"
+ "fmla v4.4s, v23.4s, v6.4s\n"
+ "fmla v15.4s, v26.4s, v11.4s\n"
+ "fmla v17.4s, v19.4s, v13.4s\n"
+ "ldr q24, [x16, x27]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "ldr q25, [x15, x28]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "fmla v5.4s, v22.4s, v6.4s\n"
+ "fmla v17.4s, v18.4s, v8.4s\n"
+ "ldr q19, [%[inptr0], x13]\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr q18, [x20]\n"
+ "fmla v16.4s, v29.4s, v8.4s\n"
+ "ldr q22, [x19, %[input_col_stride1]]\n"
+ "fmla v17.4s, v21.4s, v10.4s\n"
+ "ldr q26, [x18, x26]\n"
+ "fmla v2.4s, v29.4s, v14.4s\n"
+ "ldr q20, [x17, x27]\n"
+ "fmla v16.4s, v28.4s, v10.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v17.4s, v23.4s, v7.4s\n"
+ "ldr q27, [x16, x28]\n"
+ "fmla v15.4s, v24.4s, v8.4s\n"
+ "ldr q30, [x15, x13]\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "ldr q24, [%[inptr0], x14]\n"
+ "str q17, [%[outptr0]]\n"
+ "fmla v5.4s, v25.4s, v12.4s\n"
+ "fmla v15.4s, v25.4s, v10.4s\n"
+ "ldr q28, [x20, %[input_col_stride1]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "ldr q17, [x19, x26]\n"
+ "fmla v5.4s, v19.4s, v14.4s\n"
+ "ldr q18, [x18, x27]\n"
+ "fmla v16.4s, v26.4s, v7.4s\n"
+ "ldr q25, [x17, x28]\n"
+ "fmla v2.4s, v22.4s, v11.4s\n"
+ "ldr q22, [x16, x13]\n"
+ "fmla v4.4s, v26.4s, v9.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "str q16, [x21]\n"
+ "fmla v1.4s, v26.4s, v6.4s\n"
+ "fmla v2.4s, v26.4s, v13.4s\n"
+ "ldr q21, [x15, x14]\n"
+ "fmla v4.4s, v20.4s, v11.4s\n"
+ "ldr q23, [x20, x26]\n"
+ "fmla v15.4s, v27.4s, v7.4s\n"
+ "ldr q19, [x19, x27]\n"
+ "fmla v5.4s, v27.4s, v9.4s\n"
+ "add x15, x15, #16\n"
+ "fmla v4.4s, v27.4s, v13.4s\n"
+ "fmla v3.4s, v27.4s, v6.4s\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v2.4s, v28.4s, v8.4s\n"
+ "fmla v5.4s, v30.4s, v11.4s\n"
+ "ldr q29, [x18, x28]\n"
+ "fmla v1.4s, v17.4s, v12.4s\n"
+ "ldr q27, [x17, x13]\n"
+ "fmla v2.4s, v17.4s, v10.4s\n"
+ "ldr q28, [x16, x14]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "ldr q26, [x20, x27]\n"
+ "fmla v4.4s, v18.4s, v8.4s\n"
+ "ldr q20, [x19, x28]\n"
+ "fmla v1.4s, v18.4s, v14.4s\n"
+ "ldr q17, [x18, x13]\n"
+ "fmla v3.4s, v25.4s, v12.4s\n"
+ "ldr q18, [x17, x14]\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v5.4s, v22.4s, v8.4s\n"
+ "add x16, x16, #16\n"
+ "fmla v3.4s, v22.4s, v14.4s\n"
+ "ldr q15, [x19, x13]\n"
+ "fmla v2.4s, v23.4s, v7.4s\n"
+ "add x17, x17, #16\n"
+ "fmla v5.4s, v21.4s, v10.4s\n"
+ "ldr q21, [x18, x14]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr q23, [x20, x13]\n"
+ "str q2, [x22]\n"
+ "fmla v4.4s, v29.4s, v7.4s\n"
+ "fmla v3.4s, v29.4s, v9.4s\n"
+ "ldr q24, [x19, x14]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "ldr q25, [x20, x14]\n"
+ "str q4, [x21, %[output_col_stride1]]\n"
+ "fmla v0.4s, v29.4s, v6.4s\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "ldr q27, [%[wbptr]]\n"
+ "fmla v1.4s, v29.4s, v13.4s\n"
+ "ldr q29, [%[inptr0]]\n"
+ "fmla v5.4s, v28.4s, v7.4s\n"
+ "ldr q6, [%[wbptr], #16]\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "ldr q28, [x15]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
+ "str q5, [%[outptr0], x23]\n"
+ "fmla v0.4s, v20.4s, v12.4s\n"
+ "fmla v3.4s, v17.4s, v8.4s\n"
+ "ldr q22, [x16]\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "ldr q20, [x15, %[input_col_stride1]]\n"
+ "fmla v0.4s, v17.4s, v14.4s\n"
+ "ldr q12, [%[wbptr], #64]\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "ldr q19, [%[inptr0], x26]\n"
+ "fmla v1.4s, v16.4s, v7.4s\n"
+ "ldr q30, [x17]\n"
+ "fmla v0.4s, v16.4s, v9.4s\n"
+ "ldr q14, [%[wbptr], #32]\n"
+ "fmla v3.4s, v21.4s, v7.4s\n"
+ "ldr q18, [x16, %[input_col_stride1]]\n"
+ "str q1, [x22, %[output_col_stride1]]\n"
+ "mov v17.16b, v27.16b\n"
+ "fmla v0.4s, v15.4s, v11.4s\n"
+ "ldr q9, [%[wbptr], #112]\n"
+ "str q3, [x21, x23]\n"
+ "mov v16.16b, v27.16b\n"
+ "mov v15.16b, v27.16b\n"
+ "add x18, x18, #16\n"
+ "fmla v0.4s, v21.4s, v13.4s\n"
+ "ldr q11, [%[wbptr], #80]\n"
+ "mov v2.16b, v27.16b\n"
+ "add x19, x19, #16\n"
+ "mov v4.16b, v27.16b\n"
+ "add x20, x20, #16\n"
+ "fmla v0.4s, v23.4s, v8.4s\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "mov v5.16b, v27.16b\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "mov v1.16b, v27.16b\n"
+ "add x21, x21, #16\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "ldr q8, [%[wbptr], #128]\n"
+ "mov v3.16b, v27.16b\n"
+ "fmla v0.4s, v25.4s, v7.4s\n"
+ "ldr q10, [%[wbptr], #96]\n"
+ "str q0, [x22, x23]\n"
+ "mov v0.16b, v27.16b\n"
+ "ldr q7, [%[wbptr], #144]\n"
+ "add x22, x22, #16\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v17.4s, v29.4s, v6.4s\n"
+ "ldr q21, [x15, x26]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "ldr q27, [%[inptr0], x27]\n"
+ "fmla v15.4s, v19.4s, v6.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v28.4s, v12.4s\n"
+ "ldr q25, [x18]\n"
+ "fmla v16.4s, v30.4s, v12.4s\n"
+ "ldr q24, [x17, %[input_col_stride1]]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v17.4s, v26.4s, v14.4s\n"
+ "ldr q23, [x16, x26]\n"
+ "fmla v16.4s, v18.4s, v14.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v27.4s, v14.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v17.4s, v22.4s, v9.4s\n"
+ "ldr q22, [%[inptr0], x28]\n"
+ "fmla v16.4s, v25.4s, v9.4s\n"
+ "ldr q30, [x19]\n"
+ "fmla v15.4s, v23.4s, v9.4s\n"
+ "fmla v4.4s, v23.4s, v6.4s\n"
+ "fmla v17.4s, v20.4s, v11.4s\n"
+ "ldr q29, [x18, %[input_col_stride1]]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "ldr q28, [x17, x26]\n"
+ "fmla v15.4s, v26.4s, v11.4s\n"
+ "ldr q24, [x16, x27]\n"
+ "fmla v17.4s, v19.4s, v13.4s\n"
+ "ldr q25, [x15, x28]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "ldr q19, [%[inptr0], x13]\n"
+ "fmla v17.4s, v18.4s, v8.4s\n"
+ "ldr q18, [x20]\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr q22, [x19, %[input_col_stride1]]\n"
+ "fmla v16.4s, v29.4s, v8.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v17.4s, v21.4s, v10.4s\n"
+ "ldr q26, [x18, x26]\n"
+ "fmla v2.4s, v29.4s, v14.4s\n"
+ "ldr q20, [x17, x27]\n"
+ "fmla v16.4s, v28.4s, v10.4s\n"
+ "ldr q27, [x16, x28]\n"
+ "fmla v17.4s, v23.4s, v7.4s\n"
+ "ldr q30, [x15, x13]\n"
+ "fmla v15.4s, v24.4s, v8.4s\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "fmla v5.4s, v25.4s, v12.4s\n"
+ "ldr q24, [%[inptr0], x14]\n"
+ "str q17, [%[outptr0]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "fmla v15.4s, v25.4s, v10.4s\n"
+ "ldr q28, [x20, %[input_col_stride1]]\n"
+ "fmla v5.4s, v19.4s, v14.4s\n"
+ "ldr q17, [x19, x26]\n"
+ "fmla v2.4s, v22.4s, v11.4s\n"
+ "ldr q18, [x18, x27]\n"
+ "fmla v16.4s, v26.4s, v7.4s\n"
+ "ldr q25, [x17, x28]\n"
+ "fmla v4.4s, v26.4s, v9.4s\n"
+ "ldr q22, [x16, x13]\n"
+ "fmla v2.4s, v26.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "str q16, [x21]\n"
+ "fmla v1.4s, v26.4s, v6.4s\n"
+ "fmla v4.4s, v20.4s, v11.4s\n"
+ "ldr q21, [x15, x14]\n"
+ "fmla v15.4s, v27.4s, v7.4s\n"
+ "ldr q23, [x20, x26]\n"
+ "fmla v5.4s, v27.4s, v9.4s\n"
+ "ldr q19, [x19, x27]\n"
+ "fmla v4.4s, v27.4s, v13.4s\n"
+ "add x15, x15, #16\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v3.4s, v27.4s, v6.4s\n"
+ "fmla v5.4s, v30.4s, v11.4s\n"
+ "ldr q29, [x18, x28]\n"
+ "fmla v2.4s, v28.4s, v8.4s\n"
+ "ldr q27, [x17, x13]\n"
+ "fmla v1.4s, v17.4s, v12.4s\n"
+ "ldr q28, [x16, x14]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "ldr q26, [x20, x27]\n"
+ "fmla v2.4s, v17.4s, v10.4s\n"
+ "ldr q20, [x19, x28]\n"
+ "fmla v4.4s, v18.4s, v8.4s\n"
+ "ldr q17, [x18, x13]\n"
+ "fmla v1.4s, v18.4s, v14.4s\n"
+ "ldr q18, [x17, x14]\n"
+ "fmla v3.4s, v25.4s, v12.4s\n"
+ "add x16, x16, #16\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v5.4s, v22.4s, v8.4s\n"
+ "add x17, x17, #16\n"
+ "fmla v3.4s, v22.4s, v14.4s\n"
+ "ldr q15, [x19, x13]\n"
+ "fmla v2.4s, v23.4s, v7.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v5.4s, v21.4s, v10.4s\n"
+ "ldr q21, [x18, x14]\n"
+ "fmla v4.4s, v29.4s, v7.4s\n"
+ "ldr q23, [x20, x13]\n"
+ "str q2, [x22]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v3.4s, v29.4s, v9.4s\n"
+ "ldr q24, [x19, x14]\n"
+ "str q4, [x21, %[output_col_stride1]]\n"
+ "fmla v0.4s, v29.4s, v6.4s\n"
+ "fmla v1.4s, v29.4s, v13.4s\n"
+ "ldr q25, [x20, x14]\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "add x18, x18, #16\n"
+ "fmla v5.4s, v28.4s, v7.4s\n"
+ "add x19, x19, #16\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "add x20, x20, #16\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "fmla v0.4s, v20.4s, v12.4s\n"
+ "str q5, [%[outptr0], x23]\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v3.4s, v17.4s, v8.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v0.4s, v17.4s, v14.4s\n"
+ "fmla v1.4s, v16.4s, v7.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v16.4s, v9.4s\n"
+ "str q1, [x22, %[output_col_stride1]]\n"
+ "fmla v3.4s, v21.4s, v7.4s\n"
+ "fmla v0.4s, v15.4s, v11.4s\n"
+ "str q3, [x21, x23]\n"
+ "fmla v0.4s, v21.4s, v13.4s\n"
+ "add x21, x21, #16\n"
+ "fmla v0.4s, v23.4s, v8.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v0.4s, v25.4s, v7.4s\n"
+ "str q0, [x22, x23]\n"
+ "add x22, x22, #16\n"
+ "4:\n"
+ "cbz x24, 7f\n"
+ "ldr s27, [%[wbptr]]\n"
+ "mov v17.16b, v27.16b\n"
+ "ldr s6, [%[wbptr], #4]\n"
+ "mov v16.16b, v27.16b\n"
+ "ldr s14, [%[wbptr], #8]\n"
+ "mov v15.16b, v27.16b\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "mov v2.16b, v27.16b\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "mov v4.16b, v27.16b\n"
+ "ldr s11, [%[wbptr], #20]\n"
+ "mov v5.16b, v27.16b\n"
+ "ldr s10, [%[wbptr], #24]\n"
+ "mov v1.16b, v27.16b\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "mov v3.16b, v27.16b\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "mov v0.16b, v27.16b\n"
+ "ldr s7, [%[wbptr], #36]\n"
+ "ldr s29, [%[inptr0]]\n"
+ "subs x24, x24, #1\n"
+ "ldr s28, [x15]\n"
+ "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr s22, [x16]\n"
+ "ldr s20, [x15, %[input_col_stride1]]\n"
+ "ldr s19, [%[inptr0], x26]\n"
+ "ldr s30, [x17]\n"
+ "ldr s18, [x16, %[input_col_stride1]]\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v17.4s, v29.4s, v6.4s\n"
+ "ldr s21, [x15, x26]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "ldr s27, [%[inptr0], x27]\n"
+ "fmla v15.4s, v19.4s, v6.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v28.4s, v12.4s\n"
+ "ldr s25, [x18]\n"
+ "fmla v16.4s, v30.4s, v12.4s\n"
+ "ldr s24, [x17, %[input_col_stride1]]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v17.4s, v26.4s, v14.4s\n"
+ "ldr s23, [x16, x26]\n"
+ "fmla v16.4s, v18.4s, v14.4s\n"
+ "subs x24, x24, #1\n"
+ "fmla v15.4s, v27.4s, v14.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v17.4s, v22.4s, v9.4s\n"
+ "ldr s22, [%[inptr0], x28]\n"
+ "fmla v16.4s, v25.4s, v9.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v23.4s, v9.4s\n"
+ "ldr s30, [x19]\n"
+ "fmla v17.4s, v20.4s, v11.4s\n"
+ "ldr s29, [x18, %[input_col_stride1]]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "ldr s28, [x17, x26]\n"
+ "fmla v4.4s, v23.4s, v6.4s\n"
+ "fmla v15.4s, v26.4s, v11.4s\n"
+ "fmla v17.4s, v19.4s, v13.4s\n"
+ "ldr s24, [x16, x27]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "ldr s25, [x15, x28]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "fmla v5.4s, v22.4s, v6.4s\n"
+ "fmla v17.4s, v18.4s, v8.4s\n"
+ "ldr s19, [%[inptr0], x13]\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr s18, [x20]\n"
+ "fmla v16.4s, v29.4s, v8.4s\n"
+ "ldr s22, [x19, %[input_col_stride1]]\n"
+ "fmla v17.4s, v21.4s, v10.4s\n"
+ "ldr s26, [x18, x26]\n"
+ "fmla v2.4s, v29.4s, v14.4s\n"
+ "ldr s20, [x17, x27]\n"
+ "fmla v16.4s, v28.4s, v10.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v17.4s, v23.4s, v7.4s\n"
+ "ldr s27, [x16, x28]\n"
+ "fmla v15.4s, v24.4s, v8.4s\n"
+ "ldr s30, [x15, x13]\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "ldr s24, [%[inptr0], x14]\n"
+ "str s17, [%[outptr0]]\n"
+ "fmla v5.4s, v25.4s, v12.4s\n"
+ "fmla v15.4s, v25.4s, v10.4s\n"
+ "ldr s28, [x20, %[input_col_stride1]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "ldr s17, [x19, x26]\n"
+ "fmla v5.4s, v19.4s, v14.4s\n"
+ "ldr s18, [x18, x27]\n"
+ "fmla v16.4s, v26.4s, v7.4s\n"
+ "ldr s25, [x17, x28]\n"
+ "fmla v2.4s, v22.4s, v11.4s\n"
+ "ldr s22, [x16, x13]\n"
+ "fmla v4.4s, v26.4s, v9.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "str s16, [x21]\n"
+ "fmla v1.4s, v26.4s, v6.4s\n"
+ "fmla v2.4s, v26.4s, v13.4s\n"
+ "ldr s21, [x15, x14]\n"
+ "fmla v4.4s, v20.4s, v11.4s\n"
+ "ldr s23, [x20, x26]\n"
+ "fmla v15.4s, v27.4s, v7.4s\n"
+ "ldr s19, [x19, x27]\n"
+ "fmla v5.4s, v27.4s, v9.4s\n"
+ "add x15, x15, #4\n"
+ "fmla v4.4s, v27.4s, v13.4s\n"
+ "fmla v3.4s, v27.4s, v6.4s\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v2.4s, v28.4s, v8.4s\n"
+ "fmla v5.4s, v30.4s, v11.4s\n"
+ "ldr s29, [x18, x28]\n"
+ "fmla v1.4s, v17.4s, v12.4s\n"
+ "ldr s27, [x17, x13]\n"
+ "fmla v2.4s, v17.4s, v10.4s\n"
+ "ldr s28, [x16, x14]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "ldr s26, [x20, x27]\n"
+ "fmla v4.4s, v18.4s, v8.4s\n"
+ "ldr s20, [x19, x28]\n"
+ "fmla v1.4s, v18.4s, v14.4s\n"
+ "ldr s17, [x18, x13]\n"
+ "fmla v3.4s, v25.4s, v12.4s\n"
+ "ldr s18, [x17, x14]\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "ldr s16, [x20, x28]\n"
+ "fmla v5.4s, v22.4s, v8.4s\n"
+ "add x16, x16, #4\n"
+ "fmla v3.4s, v22.4s, v14.4s\n"
+ "ldr s15, [x19, x13]\n"
+ "fmla v2.4s, v23.4s, v7.4s\n"
+ "add x17, x17, #4\n"
+ "fmla v5.4s, v21.4s, v10.4s\n"
+ "ldr s21, [x18, x14]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr s23, [x20, x13]\n"
+ "str s2, [x22]\n"
+ "fmla v4.4s, v29.4s, v7.4s\n"
+ "fmla v3.4s, v29.4s, v9.4s\n"
+ "ldr s24, [x19, x14]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "ldr s25, [x20, x14]\n"
+ "str s4, [x21, %[output_col_stride1]]\n"
+ "fmla v0.4s, v29.4s, v6.4s\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "ldr s27, [%[wbptr]]\n"
+ "fmla v1.4s, v29.4s, v13.4s\n"
+ "ldr s29, [%[inptr0]]\n"
+ "fmla v5.4s, v28.4s, v7.4s\n"
+ "ldr s6, [%[wbptr], #4]\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "ldr s28, [x15]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
+ "str s5, [%[outptr0], x23]\n"
+ "fmla v0.4s, v20.4s, v12.4s\n"
+ "fmla v3.4s, v17.4s, v8.4s\n"
+ "ldr s22, [x16]\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "ldr s20, [x15, %[input_col_stride1]]\n"
+ "fmla v0.4s, v17.4s, v14.4s\n"
+ "ldr s12, [%[wbptr], #16]\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "ldr s19, [%[inptr0], x26]\n"
+ "fmla v1.4s, v16.4s, v7.4s\n"
+ "ldr s30, [x17]\n"
+ "fmla v0.4s, v16.4s, v9.4s\n"
+ "ldr s14, [%[wbptr], #8]\n"
+ "fmla v3.4s, v21.4s, v7.4s\n"
+ "ldr s18, [x16, %[input_col_stride1]]\n"
+ "str s1, [x22, %[output_col_stride1]]\n"
+ "mov v17.16b, v27.16b\n"
+ "fmla v0.4s, v15.4s, v11.4s\n"
+ "ldr s9, [%[wbptr], #28]\n"
+ "str s3, [x21, x23]\n"
+ "mov v16.16b, v27.16b\n"
+ "mov v15.16b, v27.16b\n"
+ "add x18, x18, #4\n"
+ "fmla v0.4s, v21.4s, v13.4s\n"
+ "ldr s11, [%[wbptr], #20]\n"
+ "mov v2.16b, v27.16b\n"
+ "add x19, x19, #4\n"
+ "mov v4.16b, v27.16b\n"
+ "add x20, x20, #4\n"
+ "fmla v0.4s, v23.4s, v8.4s\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "mov v5.16b, v27.16b\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "mov v1.16b, v27.16b\n"
+ "add x21, x21, #4\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "ldr s8, [%[wbptr], #32]\n"
+ "mov v3.16b, v27.16b\n"
+ "fmla v0.4s, v25.4s, v7.4s\n"
+ "ldr s10, [%[wbptr], #24]\n"
+ "str s0, [x22, x23]\n"
+ "mov v0.16b, v27.16b\n"
+ "ldr s7, [%[wbptr], #36]\n"
+ "add x22, x22, #4\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v17.4s, v29.4s, v6.4s\n"
+ "ldr s21, [x15, x26]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "ldr s27, [%[inptr0], x27]\n"
+ "fmla v15.4s, v19.4s, v6.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v28.4s, v12.4s\n"
+ "ldr s25, [x18]\n"
+ "fmla v16.4s, v30.4s, v12.4s\n"
+ "ldr s24, [x17, %[input_col_stride1]]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v17.4s, v26.4s, v14.4s\n"
+ "ldr s23, [x16, x26]\n"
+ "fmla v16.4s, v18.4s, v14.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v27.4s, v14.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v17.4s, v22.4s, v9.4s\n"
+ "ldr s22, [%[inptr0], x28]\n"
+ "fmla v16.4s, v25.4s, v9.4s\n"
+ "ldr s30, [x19]\n"
+ "fmla v15.4s, v23.4s, v9.4s\n"
+ "fmla v4.4s, v23.4s, v6.4s\n"
+ "fmla v17.4s, v20.4s, v11.4s\n"
+ "ldr s29, [x18, %[input_col_stride1]]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "ldr s28, [x17, x26]\n"
+ "fmla v15.4s, v26.4s, v11.4s\n"
+ "ldr s24, [x16, x27]\n"
+ "fmla v17.4s, v19.4s, v13.4s\n"
+ "ldr s25, [x15, x28]\n"
+ "fmla v16.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "ldr s19, [%[inptr0], x13]\n"
+ "fmla v17.4s, v18.4s, v8.4s\n"
+ "ldr s18, [x20]\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr s22, [x19, %[input_col_stride1]]\n"
+ "fmla v16.4s, v29.4s, v8.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v17.4s, v21.4s, v10.4s\n"
+ "ldr s26, [x18, x26]\n"
+ "fmla v2.4s, v29.4s, v14.4s\n"
+ "ldr s20, [x17, x27]\n"
+ "fmla v16.4s, v28.4s, v10.4s\n"
+ "ldr s27, [x16, x28]\n"
+ "fmla v17.4s, v23.4s, v7.4s\n"
+ "ldr s30, [x15, x13]\n"
+ "fmla v15.4s, v24.4s, v8.4s\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "fmla v5.4s, v25.4s, v12.4s\n"
+ "ldr s24, [%[inptr0], x14]\n"
+ "str s17, [%[outptr0]]\n"
+ "fmla v2.4s, v18.4s, v9.4s\n"
+ "fmla v15.4s, v25.4s, v10.4s\n"
+ "ldr s28, [x20, %[input_col_stride1]]\n"
+ "fmla v5.4s, v19.4s, v14.4s\n"
+ "ldr s17, [x19, x26]\n"
+ "fmla v2.4s, v22.4s, v11.4s\n"
+ "ldr s18, [x18, x27]\n"
+ "fmla v16.4s, v26.4s, v7.4s\n"
+ "ldr s25, [x17, x28]\n"
+ "fmla v4.4s, v26.4s, v9.4s\n"
+ "ldr s22, [x16, x13]\n"
+ "fmla v2.4s, v26.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "str s16, [x21]\n"
+ "fmla v1.4s, v26.4s, v6.4s\n"
+ "fmla v4.4s, v20.4s, v11.4s\n"
+ "ldr s21, [x15, x14]\n"
+ "fmla v15.4s, v27.4s, v7.4s\n"
+ "ldr s23, [x20, x26]\n"
+ "fmla v5.4s, v27.4s, v9.4s\n"
+ "ldr s19, [x19, x27]\n"
+ "fmla v4.4s, v27.4s, v13.4s\n"
+ "add x15, x15, #4\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v3.4s, v27.4s, v6.4s\n"
+ "fmla v5.4s, v30.4s, v11.4s\n"
+ "ldr s29, [x18, x28]\n"
+ "fmla v2.4s, v28.4s, v8.4s\n"
+ "ldr s27, [x17, x13]\n"
+ "fmla v1.4s, v17.4s, v12.4s\n"
+ "ldr s28, [x16, x14]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "ldr s26, [x20, x27]\n"
+ "fmla v2.4s, v17.4s, v10.4s\n"
+ "ldr s20, [x19, x28]\n"
+ "fmla v4.4s, v18.4s, v8.4s\n"
+ "ldr s17, [x18, x13]\n"
+ "fmla v1.4s, v18.4s, v14.4s\n"
+ "ldr s18, [x17, x14]\n"
+ "fmla v3.4s, v25.4s, v12.4s\n"
+ "add x16, x16, #4\n"
+ "fmla v4.4s, v25.4s, v10.4s\n"
+ "ldr s16, [x20, x28]\n"
+ "fmla v5.4s, v22.4s, v8.4s\n"
+ "add x17, x17, #4\n"
+ "fmla v3.4s, v22.4s, v14.4s\n"
+ "ldr s15, [x19, x13]\n"
+ "fmla v2.4s, v23.4s, v7.4s\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "fmla v5.4s, v21.4s, v10.4s\n"
+ "ldr s21, [x18, x14]\n"
+ "fmla v4.4s, v29.4s, v7.4s\n"
+ "ldr s23, [x20, x13]\n"
+ "str s2, [x22]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v3.4s, v29.4s, v9.4s\n"
+ "ldr s24, [x19, x14]\n"
+ "str s4, [x21, %[output_col_stride1]]\n"
+ "fmla v0.4s, v29.4s, v6.4s\n"
+ "fmla v1.4s, v29.4s, v13.4s\n"
+ "ldr s25, [x20, x14]\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "add x18, x18, #4\n"
+ "fmla v5.4s, v28.4s, v7.4s\n"
+ "add x19, x19, #4\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "add x20, x20, #4\n"
+ "fmla v3.4s, v28.4s, v13.4s\n"
+ "fmla v0.4s, v20.4s, v12.4s\n"
+ "str s5, [%[outptr0], x23]\n"
+ "fmla v1.4s, v20.4s, v10.4s\n"
+ "fmla v3.4s, v17.4s, v8.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v0.4s, v17.4s, v14.4s\n"
+ "fmla v1.4s, v16.4s, v7.4s\n"
+ "fmla v3.4s, v18.4s, v10.4s\n"
+ "fmla v0.4s, v16.4s, v9.4s\n"
+ "str s1, [x22, %[output_col_stride1]]\n"
+ "fmla v3.4s, v21.4s, v7.4s\n"
+ "fmla v0.4s, v15.4s, v11.4s\n"
+ "str s3, [x21, x23]\n"
+ "fmla v0.4s, v21.4s, v13.4s\n"
+ "add x21, x21, #4\n"
+ "fmla v0.4s, v23.4s, v8.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v0.4s, v25.4s, v7.4s\n"
+ "str s0, [x22, x23]\n"
+ "add x22, x22, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output)
+ : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory"
+ );
}
-
#endif // __aarch64__
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
+template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
} // namespace depthwise
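
The hunk above drops the per-padding process_tile dispatch tables in favour of explicit execute_tile<ActivationFunction> specialisations plus a plain explicit class instantiation (note the additional type parameter, presumably a separate bias type, in DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>). The following is a minimal, self-contained C++ sketch of that pattern; only the identifiers that also appear in the diff (DepthwiseConvolution, execute_tile, ActivationFunction::None, the 3x3 / stride-2 template arguments) come from the source, everything else is an illustrative assumption rather than the library's actual interface.

// Sketch of the execute_tile-per-activation pattern, under the assumptions above.
#include <cstdio>

namespace sketch
{
enum class ActivationFunction { None, ReLU, ReLU6 };

template <int OutRows, int OutCols, int KRows, int KCols, int SRows, int SCols,
          typename TIn, typename TBias, typename TOut>
class DepthwiseConvolution
{
public:
  // One specialisation per activation; in the library these bodies are the
  // hand-written AArch64 assembly blocks shown in the surrounding diff.
  template <ActivationFunction Activation>
  static void execute_tile(int n_channels, const void *weight_bias_ptr,
                           const TIn *input, unsigned int in_row_stride, unsigned int in_col_stride,
                           TOut *output, unsigned int out_row_stride, unsigned int out_col_stride)
  {
    // A reference (scalar) fall-back would live here.
    (void)n_channels; (void)weight_bias_ptr; (void)input; (void)in_row_stride;
    (void)in_col_stride; (void)output; (void)out_row_stride; (void)out_col_stride;
  }
};

// Mirrors the explicit instantiation added by the hunk above.
template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;

// Hypothetical runtime dispatch over the activation requested by the caller.
using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
void run_tile(ActivationFunction act, int n, const void *wb, const float *in, float *out)
{
  switch (act)
  {
    case ActivationFunction::None:
      Conv::execute_tile<ActivationFunction::None>(n, wb, in, 1, 1, out, 1, 1);
      break;
    default:
      std::puts("activation not implemented in this sketch");
      break;
  }
}
} // namespace sketch
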
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
deleted file mode 100644
index 33b55df..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
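
The rewritten FP32 4x4-output / 3x3-kernel file that follows carries two execute_tile<ActivationFunction::None> overloads: the first takes base input/output pointers plus row and column strides (the dense, interior-tile case), the second takes pre-gathered pointer arrays (const float *inptrs[6][6], float *outptrs[4][4]). The sketch below shows how a caller could assemble such a pointer array; the helper name gather_input_pointers and the strategy of redirecting out-of-range cells to a shared zero buffer are assumptions for illustration, not code taken from the library.

// Hypothetical caller-side helper filling the inptrs[6][6] argument of the
// pointer-array execute_tile overload; the geometry matches the 6x6 input tile
// needed for a 4x4 output tile with a 3x3 kernel at unit stride. The zero-cell
// redirection for padded positions is an assumed strategy, not quoted code.
#include <cstddef>

void gather_input_pointers(const float *in_base,
                           std::ptrdiff_t in_row_stride, std::ptrdiff_t in_col_stride,
                           int valid_rows, int valid_cols,     // cells that lie inside the tensor
                           const float *zero_cell,             // shared zeroed channel buffer
                           const float *inptrs[6][6])
{
  for (int i = 0; i < 6; i++)
  {
    for (int j = 0; j < 6; j++)
    {
      // Out-of-bounds cells read zeros instead of touching memory past the tensor.
      inptrs[i][j] = (i < valid_rows && j < valid_cols)
                       ? in_base + i * in_row_stride + j * in_col_stride
                       : zero_cell;
    }
  }
}
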
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index c36c24e..a583615 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,1468 +25,5994 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>;
+
+using namespace neon_convolution_kernels;
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
#ifdef __aarch64__
-
template <>
template <>
-void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
- const int n_channels,
- const float* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int, const int, const int, const int, const int, const int, const int, const int
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
)
{
- constexpr auto inner_tile_rows = DWC::inner_tile_rows;
- constexpr auto inner_tile_cols = DWC::inner_tile_cols;
- constexpr auto kernel_rows = DWC::kernel_rows;
- constexpr auto kernel_cols = DWC::kernel_cols;
- constexpr auto output_tile_rows = DWC::output_tile_rows;
- constexpr auto output_tile_cols = DWC::output_tile_cols;
- constexpr auto stride_rows = DWC::stride_rows;
- constexpr auto stride_cols = DWC::stride_cols;
-
- // Extract parameters
- const int in_pad_top = 0;
- const int in_pad_left = 0;
- const int in_pad_bottom = 0;
- const int in_pad_right = 0;
- const int out_pad_bottom = 0;
- const int out_pad_right = 0;
-
- // Compute valid ranges of the tile
- const int in_cells_i = inner_tile_rows - in_pad_bottom;
- const int in_cells_j = inner_tile_cols - in_pad_right;
- const int out_cells_i = output_tile_rows - out_pad_bottom;
- const int out_cells_j = output_tile_cols - out_pad_right;
-
- // Copy pointers
- const float *uptr0 = inptr;
- const float *wptr0 = weights;
- float *vptr0 = outptr;
- const bool same_strides = (
- weight_col_stride == in_col_stride &&
- weight_col_stride == out_col_stride
+ __asm __volatile(
+ "add x8, %[inptr0], %[input_row_stride]\n"
+ "add x15, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x23, %[outptr0], %[output_row_stride]\n"
+ "add x9, x8, %[input_row_stride]\n"
+ "add x16, x15, #64\n"
+ "add x17, x15, %[input_col_stride1]\n"
+ "add x10, x9, %[input_row_stride]\n"
+ "add x18, x17, #64\n"
+ "add x19, x17, %[input_col_stride1]\n"
+ "add x11, x10, %[input_row_stride]\n"
+ "add x20, x19, #64\n"
+ "add x21, x19, %[input_col_stride1]\n"
+ "add x12, x11, %[input_row_stride]\n"
+ "add x22, x21, #64\n"
+ "add x24, x23, %[output_row_stride]\n"
+ "add x25, x24, %[output_row_stride]\n"
+ "add x26, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x13, %[n_channels], #3\n"
+ "add x27, x26, %[output_col_stride1]\n"
+ "lsr x14, %[n_channels], #2\n"
+ "cbz x14, 4f\n"
+ "1:\n"
+ "ldr q14, [%[wbptr]]\n"
+ "subs x14, x14, #1\n"
+ "mov v17.16b, v14.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v23.16b, v14.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v24.16b, v14.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "mov v20.16b, v14.16b\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "mov v13.16b, v14.16b\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q29, [%[inptr0]]\n"
+ "fmla v17.4s, v29.4s, v12.4s\n"
+ "ldr q28, [x8]\n"
+ "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
+ "ldr q25, [x9]\n"
+ "ldr q26, [x8, %[input_col_stride1]]\n"
+ "ldr q27, [%[inptr0], x15]\n"
+ "ldr q15, [x10]\n"
+ "ldr q18, [x9, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x8, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "prfm pldl1keep, [x8, x28]\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "prfm pldl1keep, [x9, x28]\n"
+ "beq 3f\n"
+ "2:\n"
+ "fmla v17.4s, v28.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x16]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr q22, [x8, x15]\n"
+ "fmla v24.4s, v30.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "ldr q29, [%[inptr0], x17]\n"
+ "fmla v23.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x11, #64]\n"
+ "fmla v20.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x10, x28]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "ldr q25, [x11]\n"
+ "fmla v23.4s, v26.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x18]\n"
+ "fmla v17.4s, v26.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v16.4s, v26.4s, v12.4s\n"
+ "ldr q28, [x10, %[input_col_stride1]]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, #64]\n"
+ "fmla v17.4s, v27.4s, v10.4s\n"
+ "prfm pldl1keep, [x11, x28]\n"
+ "fmla v13.4s, v27.4s, v12.4s\n"
+ "ldr q19, [x9, x15]\n"
+ "fmla v23.4s, v15.4s, v6.4s\n"
+ "prfm pldl1keep, [x10, x16]\n"
+ "fmla v20.4s, v15.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v0.4s, v15.4s, v12.4s\n"
+ "ldr q21, [x8, x17]\n"
+ "fmla v17.4s, v18.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x20]\n"
+ "fmla v23.4s, v18.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v24.4s, v18.4s, v6.4s\n"
+ "prfm pldl1keep, [x12, x28]\n"
+ "fmla v20.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "fmla v16.4s, v18.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x18]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "ldr q27, [%[inptr0], x19]\n"
+ "fmla v17.4s, v22.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x20]\n"
+ "fmla v23.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x8, x22]\n"
+ "fmla v24.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x16]\n"
+ "fmla v16.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x18]\n"
+ "fmla v13.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x20]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr q18, [x12]\n"
+ "fmla v24.4s, v29.4s, v10.4s\n"
+ "prfm pldl1keep, [x9, x22]\n"
+ "fmla v13.4s, v29.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x18]\n"
+ "fmla v3.4s, v29.4s, v12.4s\n"
+ "ldr q22, [x11, %[input_col_stride1]]\n"
+ "fmla v20.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x20]\n"
+ "fmla v0.4s, v25.4s, v9.4s\n"
+ "ldr q25, [x10, x15]\n"
+ "fmla v23.4s, v28.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, x22]\n"
+ "fmla v20.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x20]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x22]\n"
+ "fmla v0.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x22]\n"
+ "fmla v1.4s, v28.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v23.4s, v19.4s, v7.4s\n"
+ "subs x14, x14, #1\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v10.4s\n"
+ "str q17, [%[outptr0]]\n"
+ "mov v15.16b, v14.16b\n"
+ "fmla v16.4s, v19.4s, v8.4s\n"
+ "fmla v13.4s, v19.4s, v6.4s\n"
+ "fmla v15.4s, v28.4s, v12.4s\n"
+ "ldr q29, [x9, x17]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v2.4s, v19.4s, v9.4s\n"
+ "fmla v24.4s, v21.4s, v7.4s\n"
+ "fmla v16.4s, v21.4s, v10.4s\n"
+ "fmla v13.4s, v21.4s, v8.4s\n"
+ "fmla v3.4s, v21.4s, v9.4s\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v18.16b, v14.16b\n"
+ "fmla v20.4s, v22.4s, v5.4s\n"
+ "fmla v13.4s, v27.4s, v10.4s\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "mov v17.16b, v14.16b\n"
+ "fmla v18.4s, v19.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "fmla v17.4s, v21.4s, v12.4s\n"
+ "ldr q26, [x8, x19]\n"
+ "fmla v1.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v20.4s, v25.4s, v7.4s\n"
+ "fmla v16.4s, v25.4s, v5.4s\n"
+ "fmla v0.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "str q23, [x23]\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr q28, [%[inptr0], x21]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr q30, [x12, %[input_col_stride1]]\n"
+ "fmla v24.4s, v29.4s, v4.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v16.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "str q24, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v1.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "ldr q27, [x11, x15]\n"
+ "fmla v3.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "fmla v22.4s, v29.4s, v12.4s\n"
+ "ldr q23, [x10, x17]\n"
+ "fmla v13.4s, v26.4s, v7.4s\n"
+ "fmla v2.4s, v26.4s, v10.4s\n"
+ "fmla v3.4s, v26.4s, v8.4s\n"
+ "fmla v17.4s, v26.4s, v11.4s\n"
+ "fmla v0.4s, v30.4s, v5.4s\n"
+ "ldr q24, [x9, x19]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "ldr q29, [x8, x21]\n"
+ "fmla v3.4s, v28.4s, v10.4s\n"
+ "ldr q14, [x12, x15]\n"
+ "fmla v20.4s, v27.4s, v4.4s\n"
+ "add x8, x8, #16\n"
+ "fmla v0.4s, v27.4s, v7.4s\n"
+ "prfm pldl1keep, [x8, #64]\n"
+ "fmla v1.4s, v27.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x28]\n"
+ "str q20, [x24]\n"
+ "fmla v15.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v6.4s\n"
+ "ldr q25, [x11, x17]\n"
+ "fmla v19.4s, v27.4s, v9.4s\n"
+ "ldr q30, [x10, x19]\n"
+ "fmla v16.4s, v23.4s, v4.4s\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "fmla v2.4s, v23.4s, v5.4s\n"
+ "fmla v15.4s, v23.4s, v10.4s\n"
+ "fmla v18.4s, v23.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v6.4s\n"
+ "str q16, [x23, %[output_col_stride1]]\n"
+ "fmla v19.4s, v23.4s, v11.4s\n"
+ "fmla v22.4s, v23.4s, v9.4s\n"
+ "ldr q26, [x9, x21]\n"
+ "fmla v21.4s, v23.4s, v12.4s\n"
+ "ldr q27, [x12, x17]\n"
+ "fmla v13.4s, v24.4s, v4.4s\n"
+ "ldr q20, [x11, x19]\n"
+ "fmla v2.4s, v24.4s, v7.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v3.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "str q13, [%[outptr0], x26]\n"
+ "fmla v18.4s, v24.4s, v10.4s\n"
+ "fmla v17.4s, v24.4s, v8.4s\n"
+ "ldr q23, [x10, x21]\n"
+ "fmla v22.4s, v24.4s, v11.4s\n"
+ "ldr q24, [x12, x19]\n"
+ "fmla v3.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x28]\n"
+ "fmla v17.4s, v29.4s, v10.4s\n"
+ "ldr q16, [x11, x21]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "add x10, x10, #16\n"
+ "fmla v15.4s, v14.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v19.4s, v14.4s, v6.4s\n"
+ "ldr q13, [x12, x21]\n"
+ "str q0, [x25]\n"
+ "fmla v1.4s, v25.4s, v4.4s\n"
+ "fmla v15.4s, v25.4s, v7.4s\n"
+ "ldr q14, [%[wbptr]]\n"
+ "fmla v18.4s, v25.4s, v5.4s\n"
+ "add x11, x11, #16\n"
+ "str q1, [x24, %[output_col_stride1]]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v22.4s, v25.4s, v6.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "fmla v21.4s, v25.4s, v9.4s\n"
+ "ldr q29, [%[inptr0]]\n"
+ "fmla v2.4s, v30.4s, v4.4s\n"
+ "ldr q28, [x8]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "add x12, x12, #16\n"
+ "fmla v17.4s, v30.4s, v5.4s\n"
+ "fmla v19.4s, v30.4s, v10.4s\n"
+ "str q2, [x23, x26]\n"
+ "fmla v22.4s, v30.4s, v8.4s\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "fmla v3.4s, v26.4s, v4.4s\n"
+ "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v17.4s, v26.4s, v7.4s\n"
+ "ldr q25, [x9]\n"
+ "fmla v22.4s, v26.4s, v10.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "str q3, [%[outptr0], x27]\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v19.4s, v27.4s, v5.4s\n"
+ "ldr q26, [x8, %[input_col_stride1]]\n"
+ "fmla v21.4s, v27.4s, v6.4s\n"
+ "ldr q27, [%[inptr0], x15]\n"
+ "str q15, [x25, %[output_col_stride1]]\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v19.4s, v20.4s, v7.4s\n"
+ "ldr q15, [x10]\n"
+ "fmla v22.4s, v20.4s, v5.4s\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "str q18, [x24, x26]\n"
+ "fmla v21.4s, v20.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "ldr q18, [x9, %[input_col_stride1]]\n"
+ "fmla v22.4s, v23.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v21.4s, v23.4s, v10.4s\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "str q17, [x23, x27]\n"
+ "fmla v19.4s, v24.4s, v4.4s\n"
+ "fmla v22.4s, v16.4s, v4.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v21.4s, v24.4s, v5.4s\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "str q19, [x25, x26]\n"
+ "mov v17.16b, v14.16b\n"
+ "str q22, [x24, x27]\n"
+ "mov v23.16b, v14.16b\n"
+ "fmla v21.4s, v16.4s, v7.4s\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v24.16b, v14.16b\n"
+ "add x24, x24, #16\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
+ "fmla v21.4s, v13.4s, v4.4s\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "mov v13.16b, v14.16b\n"
+ "mov v0.16b, v14.16b\n"
+ "mov v1.16b, v14.16b\n"
+ "mov v2.16b, v14.16b\n"
+ "str q21, [x25, x27]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "add x25, x25, #16\n"
+ "fmla v17.4s, v29.4s, v12.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fmla v17.4s, v28.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x16]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr q22, [x8, x15]\n"
+ "fmla v24.4s, v30.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "ldr q29, [%[inptr0], x17]\n"
+ "fmla v23.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x11, #64]\n"
+ "fmla v20.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x10, x28]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "ldr q25, [x11]\n"
+ "fmla v23.4s, v26.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x18]\n"
+ "fmla v17.4s, v26.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v16.4s, v26.4s, v12.4s\n"
+ "ldr q28, [x10, %[input_col_stride1]]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, #64]\n"
+ "fmla v17.4s, v27.4s, v10.4s\n"
+ "prfm pldl1keep, [x11, x28]\n"
+ "fmla v13.4s, v27.4s, v12.4s\n"
+ "ldr q19, [x9, x15]\n"
+ "fmla v23.4s, v15.4s, v6.4s\n"
+ "prfm pldl1keep, [x10, x16]\n"
+ "fmla v20.4s, v15.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v0.4s, v15.4s, v12.4s\n"
+ "ldr q21, [x8, x17]\n"
+ "fmla v17.4s, v18.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x20]\n"
+ "fmla v23.4s, v18.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v24.4s, v18.4s, v6.4s\n"
+ "prfm pldl1keep, [x12, x28]\n"
+ "fmla v20.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "fmla v16.4s, v18.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x18]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "ldr q27, [%[inptr0], x19]\n"
+ "fmla v17.4s, v22.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x20]\n"
+ "fmla v23.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x8, x22]\n"
+ "fmla v24.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x16]\n"
+ "fmla v16.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x18]\n"
+ "fmla v13.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x20]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr q18, [x12]\n"
+ "fmla v24.4s, v29.4s, v10.4s\n"
+ "prfm pldl1keep, [x9, x22]\n"
+ "fmla v13.4s, v29.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x18]\n"
+ "fmla v3.4s, v29.4s, v12.4s\n"
+ "ldr q22, [x11, %[input_col_stride1]]\n"
+ "fmla v20.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x20]\n"
+ "fmla v0.4s, v25.4s, v9.4s\n"
+ "ldr q25, [x10, x15]\n"
+ "fmla v23.4s, v28.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, x22]\n"
+ "fmla v20.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x20]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x22]\n"
+ "fmla v0.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x22]\n"
+ "fmla v1.4s, v28.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v17.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v23.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v10.4s\n"
+ "fmla v16.4s, v19.4s, v8.4s\n"
+ "str q17, [%[outptr0]]\n"
+ "mov v15.16b, v14.16b\n"
+ "fmla v13.4s, v19.4s, v6.4s\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v15.4s, v28.4s, v12.4s\n"
+ "ldr q29, [x9, x17]\n"
+ "fmla v2.4s, v19.4s, v9.4s\n"
+ "fmla v24.4s, v21.4s, v7.4s\n"
+ "fmla v16.4s, v21.4s, v10.4s\n"
+ "fmla v13.4s, v21.4s, v8.4s\n"
+ "fmla v3.4s, v21.4s, v9.4s\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v18.16b, v14.16b\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "fmla v13.4s, v27.4s, v10.4s\n"
+ "fmla v20.4s, v22.4s, v5.4s\n"
+ "fmla v18.4s, v19.4s, v12.4s\n"
+ "ldr q26, [x8, x19]\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "ldr q28, [%[inptr0], x21]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v1.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v20.4s, v25.4s, v7.4s\n"
+ "fmla v16.4s, v25.4s, v5.4s\n"
+ "fmla v17.4s, v21.4s, v12.4s\n"
+ "ldr q30, [x12, %[input_col_stride1]]\n"
+ "str q23, [x23]\n"
+ "mov v19.16b, v14.16b\n"
+ "fmla v0.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "fmla v24.4s, v29.4s, v4.4s\n"
+ "fmla v16.4s, v29.4s, v7.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v1.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v3.4s, v29.4s, v6.4s\n"
+ "str q24, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "ldr q27, [x11, x15]\n"
+ "fmla v22.4s, v29.4s, v12.4s\n"
+ "ldr q23, [x10, x17]\n"
+ "fmla v13.4s, v26.4s, v7.4s\n"
+ "fmla v2.4s, v26.4s, v10.4s\n"
+ "fmla v3.4s, v26.4s, v8.4s\n"
+ "fmla v17.4s, v26.4s, v11.4s\n"
+ "fmla v0.4s, v30.4s, v5.4s\n"
+ "ldr q24, [x9, x19]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "ldr q29, [x8, x21]\n"
+ "fmla v3.4s, v28.4s, v10.4s\n"
+ "ldr q14, [x12, x15]\n"
+ "fmla v20.4s, v27.4s, v4.4s\n"
+ "add x8, x8, #16\n"
+ "fmla v0.4s, v27.4s, v7.4s\n"
+ "fmla v1.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v6.4s\n"
+ "str q20, [x24]\n"
+ "fmla v19.4s, v27.4s, v9.4s\n"
+ "fmla v16.4s, v23.4s, v4.4s\n"
+ "ldr q25, [x11, x17]\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "ldr q30, [x10, x19]\n"
+ "fmla v2.4s, v23.4s, v5.4s\n"
+ "fmla v15.4s, v23.4s, v10.4s\n"
+ "str q16, [x23, %[output_col_stride1]]\n"
+ "fmla v18.4s, v23.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v6.4s\n"
+ "ldr q26, [x9, x21]\n"
+ "fmla v19.4s, v23.4s, v11.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v22.4s, v23.4s, v9.4s\n"
+ "fmla v21.4s, v23.4s, v12.4s\n"
+ "fmla v13.4s, v24.4s, v4.4s\n"
+ "ldr q27, [x12, x17]\n"
+ "fmla v2.4s, v24.4s, v7.4s\n"
+ "ldr q20, [x11, x19]\n"
+ "fmla v3.4s, v24.4s, v5.4s\n"
+ "fmla v18.4s, v24.4s, v10.4s\n"
+ "str q13, [%[outptr0], x26]\n"
+ "fmla v17.4s, v24.4s, v8.4s\n"
+ "fmla v22.4s, v24.4s, v11.4s\n"
+ "ldr q23, [x10, x21]\n"
+ "fmla v3.4s, v29.4s, v7.4s\n"
+ "ldr q24, [x12, x19]\n"
+ "fmla v17.4s, v29.4s, v10.4s\n"
+ "ldr q16, [x11, x21]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "add x10, x10, #16\n"
+ "fmla v15.4s, v14.4s, v5.4s\n"
+ "add x11, x11, #16\n"
+ "fmla v19.4s, v14.4s, v6.4s\n"
+ "ldr q13, [x12, x21]\n"
+ "str q0, [x25]\n"
+ "fmla v1.4s, v25.4s, v4.4s\n"
+ "fmla v15.4s, v25.4s, v7.4s\n"
+ "add x12, x12, #16\n"
+ "fmla v18.4s, v25.4s, v5.4s\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "str q1, [x24, %[output_col_stride1]]\n"
+ "fmla v22.4s, v25.4s, v6.4s\n"
+ "fmla v21.4s, v25.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "fmla v17.4s, v30.4s, v5.4s\n"
+ "fmla v19.4s, v30.4s, v10.4s\n"
+ "fmla v22.4s, v30.4s, v8.4s\n"
+ "str q2, [x23, x26]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "fmla v3.4s, v26.4s, v4.4s\n"
+ "fmla v17.4s, v26.4s, v7.4s\n"
+ "fmla v22.4s, v26.4s, v10.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v19.4s, v27.4s, v5.4s\n"
+ "fmla v21.4s, v27.4s, v6.4s\n"
+ "str q3, [%[outptr0], x27]\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "str q15, [x25, %[output_col_stride1]]\n"
+ "fmla v22.4s, v20.4s, v5.4s\n"
+ "fmla v19.4s, v20.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q18, [x24, x26]\n"
+ "fmla v21.4s, v20.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "fmla v22.4s, v23.4s, v7.4s\n"
+ "fmla v19.4s, v24.4s, v4.4s\n"
+ "fmla v21.4s, v23.4s, v10.4s\n"
+ "str q17, [x23, x27]\n"
+ "fmla v22.4s, v16.4s, v4.4s\n"
+ "str q19, [x25, x26]\n"
+ "add x23, x23, #16\n"
+ "fmla v21.4s, v24.4s, v5.4s\n"
+ "str q22, [x24, x27]\n"
+ "add x24, x24, #16\n"
+ "fmla v21.4s, v16.4s, v7.4s\n"
+ "fmla v21.4s, v13.4s, v4.4s\n"
+ "str q21, [x25, x27]\n"
+ "add x25, x25, #16\n"
+ "4:\n"
+ "cbz x13, 7f\n"
+ "ldr s14, [%[wbptr]]\n"
+ "mov v17.16b, v14.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v23.16b, v14.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v24.16b, v14.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "mov v20.16b, v14.16b\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "mov v13.16b, v14.16b\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "mov v0.16b, v14.16b\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "mov v1.16b, v14.16b\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v2.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s29, [%[inptr0]]\n"
+ "fmla v17.4s, v29.4s, v12.4s\n"
+ "ldr s28, [x8]\n"
+ "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
+ "subs x13, x13, #1\n"
+ "ldr s25, [x9]\n"
+ "ldr s26, [x8, %[input_col_stride1]]\n"
+ "ldr s27, [%[inptr0], x15]\n"
+ "ldr s15, [x10]\n"
+ "ldr s18, [x9, %[input_col_stride1]]\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x8, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "prfm pldl1keep, [x8, x28]\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "prfm pldl1keep, [x9, x28]\n"
+ "beq 6f\n"
+ "5:\n"
+ "fmla v17.4s, v28.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x16]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr s22, [x8, x15]\n"
+ "fmla v24.4s, v30.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "ldr s29, [%[inptr0], x17]\n"
+ "fmla v23.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x11, #64]\n"
+ "fmla v20.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x10, x28]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "ldr s25, [x11]\n"
+ "fmla v23.4s, v26.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x18]\n"
+ "fmla v17.4s, v26.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v16.4s, v26.4s, v12.4s\n"
+ "ldr s28, [x10, %[input_col_stride1]]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, #64]\n"
+ "fmla v17.4s, v27.4s, v10.4s\n"
+ "prfm pldl1keep, [x11, x28]\n"
+ "fmla v13.4s, v27.4s, v12.4s\n"
+ "ldr s19, [x9, x15]\n"
+ "fmla v23.4s, v15.4s, v6.4s\n"
+ "prfm pldl1keep, [x10, x16]\n"
+ "fmla v20.4s, v15.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v0.4s, v15.4s, v12.4s\n"
+ "ldr s21, [x8, x17]\n"
+ "fmla v17.4s, v18.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x20]\n"
+ "fmla v23.4s, v18.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v24.4s, v18.4s, v6.4s\n"
+ "prfm pldl1keep, [x12, x28]\n"
+ "fmla v20.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "fmla v16.4s, v18.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x18]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "ldr s27, [%[inptr0], x19]\n"
+ "fmla v17.4s, v22.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x20]\n"
+ "fmla v23.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x8, x22]\n"
+ "fmla v24.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x16]\n"
+ "fmla v16.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x18]\n"
+ "fmla v13.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x20]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr s18, [x12]\n"
+ "fmla v24.4s, v29.4s, v10.4s\n"
+ "prfm pldl1keep, [x9, x22]\n"
+ "fmla v13.4s, v29.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x18]\n"
+ "fmla v3.4s, v29.4s, v12.4s\n"
+ "ldr s22, [x11, %[input_col_stride1]]\n"
+ "fmla v20.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x20]\n"
+ "fmla v0.4s, v25.4s, v9.4s\n"
+ "ldr s25, [x10, x15]\n"
+ "fmla v23.4s, v28.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, x22]\n"
+ "fmla v20.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x20]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x22]\n"
+ "fmla v0.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x22]\n"
+ "fmla v1.4s, v28.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v23.4s, v19.4s, v7.4s\n"
+ "subs x13, x13, #1\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v10.4s\n"
+ "str s17, [%[outptr0]]\n"
+ "mov v15.16b, v14.16b\n"
+ "fmla v16.4s, v19.4s, v8.4s\n"
+ "fmla v13.4s, v19.4s, v6.4s\n"
+ "fmla v15.4s, v28.4s, v12.4s\n"
+ "ldr s29, [x9, x17]\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v2.4s, v19.4s, v9.4s\n"
+ "fmla v24.4s, v21.4s, v7.4s\n"
+ "fmla v16.4s, v21.4s, v10.4s\n"
+ "fmla v13.4s, v21.4s, v8.4s\n"
+ "fmla v3.4s, v21.4s, v9.4s\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v18.16b, v14.16b\n"
+ "fmla v20.4s, v22.4s, v5.4s\n"
+ "fmla v13.4s, v27.4s, v10.4s\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "mov v17.16b, v14.16b\n"
+ "fmla v18.4s, v19.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "fmla v17.4s, v21.4s, v12.4s\n"
+ "ldr s26, [x8, x19]\n"
+ "fmla v1.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v20.4s, v25.4s, v7.4s\n"
+ "fmla v16.4s, v25.4s, v5.4s\n"
+ "fmla v0.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "str s23, [x23]\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr s28, [%[inptr0], x21]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr s30, [x12, %[input_col_stride1]]\n"
+ "fmla v24.4s, v29.4s, v4.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v16.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x28]\n"
+ "str s24, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v1.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "ldr s27, [x11, x15]\n"
+ "fmla v3.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "fmla v22.4s, v29.4s, v12.4s\n"
+ "ldr s23, [x10, x17]\n"
+ "fmla v13.4s, v26.4s, v7.4s\n"
+ "fmla v2.4s, v26.4s, v10.4s\n"
+ "fmla v3.4s, v26.4s, v8.4s\n"
+ "fmla v17.4s, v26.4s, v11.4s\n"
+ "fmla v0.4s, v30.4s, v5.4s\n"
+ "ldr s24, [x9, x19]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "ldr s29, [x8, x21]\n"
+ "fmla v3.4s, v28.4s, v10.4s\n"
+ "ldr s14, [x12, x15]\n"
+ "fmla v20.4s, v27.4s, v4.4s\n"
+ "add x8, x8, #4\n"
+ "fmla v0.4s, v27.4s, v7.4s\n"
+ "prfm pldl1keep, [x8, #64]\n"
+ "fmla v1.4s, v27.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x28]\n"
+ "str s20, [x24]\n"
+ "fmla v15.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v6.4s\n"
+ "ldr s25, [x11, x17]\n"
+ "fmla v19.4s, v27.4s, v9.4s\n"
+ "ldr s30, [x10, x19]\n"
+ "fmla v16.4s, v23.4s, v4.4s\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "fmla v2.4s, v23.4s, v5.4s\n"
+ "fmla v15.4s, v23.4s, v10.4s\n"
+ "fmla v18.4s, v23.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v6.4s\n"
+ "str s16, [x23, %[output_col_stride1]]\n"
+ "fmla v19.4s, v23.4s, v11.4s\n"
+ "fmla v22.4s, v23.4s, v9.4s\n"
+ "ldr s26, [x9, x21]\n"
+ "fmla v21.4s, v23.4s, v12.4s\n"
+ "ldr s27, [x12, x17]\n"
+ "fmla v13.4s, v24.4s, v4.4s\n"
+ "ldr s20, [x11, x19]\n"
+ "fmla v2.4s, v24.4s, v7.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v3.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "str s13, [%[outptr0], x26]\n"
+ "fmla v18.4s, v24.4s, v10.4s\n"
+ "fmla v17.4s, v24.4s, v8.4s\n"
+ "ldr s23, [x10, x21]\n"
+ "fmla v22.4s, v24.4s, v11.4s\n"
+ "ldr s24, [x12, x19]\n"
+ "fmla v3.4s, v29.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x28]\n"
+ "fmla v17.4s, v29.4s, v10.4s\n"
+ "ldr s16, [x11, x21]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "add x10, x10, #4\n"
+ "fmla v15.4s, v14.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v19.4s, v14.4s, v6.4s\n"
+ "ldr s13, [x12, x21]\n"
+ "str s0, [x25]\n"
+ "fmla v1.4s, v25.4s, v4.4s\n"
+ "fmla v15.4s, v25.4s, v7.4s\n"
+ "ldr s14, [%[wbptr]]\n"
+ "fmla v18.4s, v25.4s, v5.4s\n"
+ "add x11, x11, #4\n"
+ "str s1, [x24, %[output_col_stride1]]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v22.4s, v25.4s, v6.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "fmla v21.4s, v25.4s, v9.4s\n"
+ "ldr s29, [%[inptr0]]\n"
+ "fmla v2.4s, v30.4s, v4.4s\n"
+ "ldr s28, [x8]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "add x12, x12, #4\n"
+ "fmla v17.4s, v30.4s, v5.4s\n"
+ "fmla v19.4s, v30.4s, v10.4s\n"
+ "str s2, [x23, x26]\n"
+ "fmla v22.4s, v30.4s, v8.4s\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "fmla v3.4s, v26.4s, v4.4s\n"
+ "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v17.4s, v26.4s, v7.4s\n"
+ "ldr s25, [x9]\n"
+ "fmla v22.4s, v26.4s, v10.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "str s3, [%[outptr0], x27]\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v19.4s, v27.4s, v5.4s\n"
+ "ldr s26, [x8, %[input_col_stride1]]\n"
+ "fmla v21.4s, v27.4s, v6.4s\n"
+ "ldr s27, [%[inptr0], x15]\n"
+ "str s15, [x25, %[output_col_stride1]]\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v19.4s, v20.4s, v7.4s\n"
+ "ldr s15, [x10]\n"
+ "fmla v22.4s, v20.4s, v5.4s\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "str s18, [x24, x26]\n"
+ "fmla v21.4s, v20.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "ldr s18, [x9, %[input_col_stride1]]\n"
+ "fmla v22.4s, v23.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v21.4s, v23.4s, v10.4s\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "str s17, [x23, x27]\n"
+ "fmla v19.4s, v24.4s, v4.4s\n"
+ "fmla v22.4s, v16.4s, v4.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v21.4s, v24.4s, v5.4s\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "str s19, [x25, x26]\n"
+ "mov v17.16b, v14.16b\n"
+ "str s22, [x24, x27]\n"
+ "mov v23.16b, v14.16b\n"
+ "fmla v21.4s, v16.4s, v7.4s\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v24.16b, v14.16b\n"
+ "add x24, x24, #4\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
+ "fmla v21.4s, v13.4s, v4.4s\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "mov v13.16b, v14.16b\n"
+ "mov v0.16b, v14.16b\n"
+ "mov v1.16b, v14.16b\n"
+ "mov v2.16b, v14.16b\n"
+ "str s21, [x25, x27]\n"
+ "mov v3.16b, v14.16b\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "add x25, x25, #4\n"
+ "fmla v17.4s, v29.4s, v12.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "fmla v17.4s, v28.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x16]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr s22, [x8, x15]\n"
+ "fmla v24.4s, v30.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "ldr s29, [%[inptr0], x17]\n"
+ "fmla v23.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x11, #64]\n"
+ "fmla v20.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x10, x28]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "ldr s25, [x11]\n"
+ "fmla v23.4s, v26.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [x8, x18]\n"
+ "fmla v17.4s, v26.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x20]\n"
+ "fmla v16.4s, v26.4s, v12.4s\n"
+ "ldr s28, [x10, %[input_col_stride1]]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, #64]\n"
+ "fmla v17.4s, v27.4s, v10.4s\n"
+ "prfm pldl1keep, [x11, x28]\n"
+ "fmla v13.4s, v27.4s, v12.4s\n"
+ "ldr s19, [x9, x15]\n"
+ "fmla v23.4s, v15.4s, v6.4s\n"
+ "prfm pldl1keep, [x10, x16]\n"
+ "fmla v20.4s, v15.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v0.4s, v15.4s, v12.4s\n"
+ "ldr s21, [x8, x17]\n"
+ "fmla v17.4s, v18.4s, v5.4s\n"
+ "prfm pldl1keep, [x8, x20]\n"
+ "fmla v23.4s, v18.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x22]\n"
+ "fmla v24.4s, v18.4s, v6.4s\n"
+ "prfm pldl1keep, [x12, x28]\n"
+ "fmla v20.4s, v18.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x16]\n"
+ "fmla v16.4s, v18.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x18]\n"
+ "fmla v1.4s, v18.4s, v12.4s\n"
+ "ldr s27, [%[inptr0], x19]\n"
+ "fmla v17.4s, v22.4s, v7.4s\n"
+ "prfm pldl1keep, [x9, x20]\n"
+ "fmla v23.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x8, x22]\n"
+ "fmla v24.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x16]\n"
+ "fmla v16.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x11, x18]\n"
+ "fmla v13.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x20]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "ldr s18, [x12]\n"
+ "fmla v24.4s, v29.4s, v10.4s\n"
+ "prfm pldl1keep, [x9, x22]\n"
+ "fmla v13.4s, v29.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x18]\n"
+ "fmla v3.4s, v29.4s, v12.4s\n"
+ "ldr s22, [x11, %[input_col_stride1]]\n"
+ "fmla v20.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x20]\n"
+ "fmla v0.4s, v25.4s, v9.4s\n"
+ "ldr s25, [x10, x15]\n"
+ "fmla v23.4s, v28.4s, v5.4s\n"
+ "prfm pldl1keep, [x10, x22]\n"
+ "fmla v20.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x12, x20]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "prfm pldl1keep, [x11, x22]\n"
+ "fmla v0.4s, v28.4s, v11.4s\n"
+ "prfm pldl1keep, [x12, x22]\n"
+ "fmla v1.4s, v28.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v17.4s, v19.4s, v4.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v23.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v10.4s\n"
+ "fmla v16.4s, v19.4s, v8.4s\n"
+ "str s17, [%[outptr0]]\n"
+ "mov v15.16b, v14.16b\n"
+ "fmla v13.4s, v19.4s, v6.4s\n"
+ "fmla v1.4s, v19.4s, v11.4s\n"
+ "fmla v15.4s, v28.4s, v12.4s\n"
+ "ldr s29, [x9, x17]\n"
+ "fmla v2.4s, v19.4s, v9.4s\n"
+ "fmla v24.4s, v21.4s, v7.4s\n"
+ "fmla v16.4s, v21.4s, v10.4s\n"
+ "fmla v13.4s, v21.4s, v8.4s\n"
+ "fmla v3.4s, v21.4s, v9.4s\n"
+ "fmla v0.4s, v18.4s, v6.4s\n"
+ "mov v18.16b, v14.16b\n"
+ "fmla v2.4s, v21.4s, v11.4s\n"
+ "fmla v13.4s, v27.4s, v10.4s\n"
+ "fmla v20.4s, v22.4s, v5.4s\n"
+ "fmla v18.4s, v19.4s, v12.4s\n"
+ "ldr s26, [x8, x19]\n"
+ "fmla v3.4s, v27.4s, v11.4s\n"
+ "ldr s28, [%[inptr0], x21]\n"
+ "fmla v0.4s, v22.4s, v8.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v1.4s, v22.4s, v6.4s\n"
+ "fmla v15.4s, v22.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v20.4s, v25.4s, v7.4s\n"
+ "fmla v16.4s, v25.4s, v5.4s\n"
+ "fmla v17.4s, v21.4s, v12.4s\n"
+ "ldr s30, [x12, %[input_col_stride1]]\n"
+ "str s23, [x23]\n"
+ "mov v19.16b, v14.16b\n"
+ "fmla v0.4s, v25.4s, v10.4s\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "fmla v2.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "fmla v24.4s, v29.4s, v4.4s\n"
+ "fmla v16.4s, v29.4s, v7.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v1.4s, v29.4s, v10.4s\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v3.4s, v29.4s, v6.4s\n"
+ "str s24, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "ldr s27, [x11, x15]\n"
+ "fmla v22.4s, v29.4s, v12.4s\n"
+ "ldr s23, [x10, x17]\n"
+ "fmla v13.4s, v26.4s, v7.4s\n"
+ "fmla v2.4s, v26.4s, v10.4s\n"
+ "fmla v3.4s, v26.4s, v8.4s\n"
+ "fmla v17.4s, v26.4s, v11.4s\n"
+ "fmla v0.4s, v30.4s, v5.4s\n"
+ "ldr s24, [x9, x19]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "ldr s29, [x8, x21]\n"
+ "fmla v3.4s, v28.4s, v10.4s\n"
+ "ldr s14, [x12, x15]\n"
+ "fmla v20.4s, v27.4s, v4.4s\n"
+ "add x8, x8, #4\n"
+ "fmla v0.4s, v27.4s, v7.4s\n"
+ "fmla v1.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v6.4s\n"
+ "str s20, [x24]\n"
+ "fmla v19.4s, v27.4s, v9.4s\n"
+ "fmla v16.4s, v23.4s, v4.4s\n"
+ "ldr s25, [x11, x17]\n"
+ "fmla v1.4s, v23.4s, v7.4s\n"
+ "ldr s30, [x10, x19]\n"
+ "fmla v2.4s, v23.4s, v5.4s\n"
+ "fmla v15.4s, v23.4s, v10.4s\n"
+ "str s16, [x23, %[output_col_stride1]]\n"
+ "fmla v18.4s, v23.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v6.4s\n"
+ "ldr s26, [x9, x21]\n"
+ "fmla v19.4s, v23.4s, v11.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v22.4s, v23.4s, v9.4s\n"
+ "fmla v21.4s, v23.4s, v12.4s\n"
+ "fmla v13.4s, v24.4s, v4.4s\n"
+ "ldr s27, [x12, x17]\n"
+ "fmla v2.4s, v24.4s, v7.4s\n"
+ "ldr s20, [x11, x19]\n"
+ "fmla v3.4s, v24.4s, v5.4s\n"
+ "fmla v18.4s, v24.4s, v10.4s\n"
+ "str s13, [%[outptr0], x26]\n"
+ "fmla v17.4s, v24.4s, v8.4s\n"
+ "fmla v22.4s, v24.4s, v11.4s\n"
+ "ldr s23, [x10, x21]\n"
+ "fmla v3.4s, v29.4s, v7.4s\n"
+ "ldr s24, [x12, x19]\n"
+ "fmla v17.4s, v29.4s, v10.4s\n"
+ "ldr s16, [x11, x21]\n"
+ "fmla v0.4s, v14.4s, v4.4s\n"
+ "add x10, x10, #4\n"
+ "fmla v15.4s, v14.4s, v5.4s\n"
+ "add x11, x11, #4\n"
+ "fmla v19.4s, v14.4s, v6.4s\n"
+ "ldr s13, [x12, x21]\n"
+ "str s0, [x25]\n"
+ "fmla v1.4s, v25.4s, v4.4s\n"
+ "fmla v15.4s, v25.4s, v7.4s\n"
+ "add x12, x12, #4\n"
+ "fmla v18.4s, v25.4s, v5.4s\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "str s1, [x24, %[output_col_stride1]]\n"
+ "fmla v22.4s, v25.4s, v6.4s\n"
+ "fmla v21.4s, v25.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "fmla v17.4s, v30.4s, v5.4s\n"
+ "fmla v19.4s, v30.4s, v10.4s\n"
+ "fmla v22.4s, v30.4s, v8.4s\n"
+ "str s2, [x23, x26]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "fmla v3.4s, v26.4s, v4.4s\n"
+ "fmla v17.4s, v26.4s, v7.4s\n"
+ "fmla v22.4s, v26.4s, v10.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v19.4s, v27.4s, v5.4s\n"
+ "fmla v21.4s, v27.4s, v6.4s\n"
+ "str s3, [%[outptr0], x27]\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "str s15, [x25, %[output_col_stride1]]\n"
+ "fmla v22.4s, v20.4s, v5.4s\n"
+ "fmla v19.4s, v20.4s, v7.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "str s18, [x24, x26]\n"
+ "fmla v21.4s, v20.4s, v8.4s\n"
+ "fmla v17.4s, v23.4s, v4.4s\n"
+ "fmla v22.4s, v23.4s, v7.4s\n"
+ "fmla v19.4s, v24.4s, v4.4s\n"
+ "fmla v21.4s, v23.4s, v10.4s\n"
+ "str s17, [x23, x27]\n"
+ "fmla v22.4s, v16.4s, v4.4s\n"
+ "str s19, [x25, x26]\n"
+ "add x23, x23, #4\n"
+ "fmla v21.4s, v24.4s, v5.4s\n"
+ "str s22, [x24, x27]\n"
+ "add x24, x24, #4\n"
+ "fmla v21.4s, v16.4s, v7.4s\n"
+ "fmla v21.4s, v13.4s, v4.4s\n"
+ "str s21, [x25, x27]\n"
+ "add x25, x25, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
);
- int channels_remaining = n_channels;
- if (channels_remaining >= 4 && same_strides)
- {
- int c4_rem = channels_remaining / 4;
- channels_remaining %= 4;
- const int prefetch_depth = 8;
+}
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::None>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[6][6],
+ float *outptrs[4][4]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x15, %[n_channels], #3\n"
+ "lsr x16, %[n_channels], #2\n"
+ "cbz x16, 4f\n"
+ "1:\n"
+ "ldr q13, [%[wbptr]]\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v2.16b, v13.16b\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "ldr q27, [x18, x27]\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "ldr q28, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "ldr q25, [x19, x27]\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "ldr q16, [x18, x27]\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "ldr q15, [x17, x27]\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "ldr q21, [x20, x27]\n"
+ "subs x16, x16, #1\n"
+ "ldr q29, [x19, x27]\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr q30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr q31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr q25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr q21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr q20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "subs x16, x16, #1\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr q27, [x22, x27]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr q30, [x21, x27]\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "ldr q24, [x20, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str q18, [x23, x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr q27, [x17, x27]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "ldr q28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "str q22, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr q22, [x21, x27]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "str q23, [x23, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr q30, [x20, x27]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr q31, [x19, x27]\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr q26, [x18, x27]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr q23, [x22, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "str q19, [x25, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr q27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "ldr q28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str q17, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "ldr q22, [x19, x27]\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr q30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str q14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr q31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr q17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr q14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr q26, [x22, x27]\n"
+ "str q0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr q13, [%[wbptr]]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "add x27, x27, #16\n"
+ "str q1, [x25, x28]\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "ldr q12, [%[wbptr], #16]\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr q29, [x17, x27]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "str q2, [x24, x28]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "ldr q9, [%[wbptr], #64]\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "ldr q28, [x17, x27]\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "ldr q11, [%[wbptr], #32]\n"
+ "str q3, [x23, x28]\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "str q16, [x26, x28]\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "ldr q16, [x18, x27]\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "ldr q6, [%[wbptr], #112]\n"
+ "str q15, [x25, x28]\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "ldr q15, [x17, x27]\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "ldr q8, [%[wbptr], #80]\n"
+ "str q18, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "str q21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr q10, [%[wbptr], #48]\n"
+ "str q24, [x25, x28]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr q21, [x20, x27]\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "ldr q5, [%[wbptr], #128]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "mov v1.16b, v13.16b\n"
+ "mov v2.16b, v13.16b\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "ldr q7, [%[wbptr], #96]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr q29, [x19, x27]\n"
+ "str q20, [x26, x28]\n"
+ "ldr q4, [%[wbptr], #144]\n"
+ "add x28, x28, #16\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr q30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr q31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr q25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr q21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr q20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr q27, [x22, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr q30, [x21, x27]\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str q18, [x23, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr q24, [x20, x27]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr q25, [x19, x27]\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr q27, [x17, x27]\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "ldr q28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "str q22, [x24, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr q22, [x21, x27]\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "str q23, [x23, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr q30, [x20, x27]\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr q31, [x19, x27]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr q26, [x18, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr q23, [x22, x27]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "str q19, [x25, x28]\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "ldr q27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr q28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str q17, [x24, x28]\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr q22, [x19, x27]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr q30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr q19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str q14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr q31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr q17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr q14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr q26, [x22, x27]\n"
+ "str q0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "str q1, [x25, x28]\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "str q2, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "str q3, [x23, x28]\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "str q16, [x26, x28]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "str q15, [x25, x28]\n"
+ "str q18, [x24, x28]\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "str q21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "str q24, [x25, x28]\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "str q20, [x26, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x15, 7f\n"
+ "ldr s13, [%[wbptr]]\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v2.16b, v13.16b\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "subs x15, x15, #1\n"
+ "ldr s29, [x17, x27]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "ldr s25, [x19, x27]\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "ldr s21, [x20, x27]\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "ldr s28, [x17, x27]\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "ldr s16, [x18, x27]\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "ldr s29, [x19, x27]\n"
+ "ldr s15, [x17, x27]\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr s30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr s31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr s25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr s21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr s20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "subs x15, x15, #1\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "ldr s27, [x22, x27]\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr s30, [x21, x27]\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "ldr s24, [x20, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str s18, [x23, x28]\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr s27, [x17, x27]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "ldr s28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "ldr x17, [%[inptrs], 0]\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "str s22, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr s22, [x21, x27]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "str s23, [x23, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr s30, [x20, x27]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr s31, [x19, x27]\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr s26, [x18, x27]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr s23, [x22, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x18, [%[inptrs], 48]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "str s19, [x25, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr s27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "ldr s28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str s17, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "ldr s22, [x19, x27]\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x19, [%[inptrs], 96]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr s30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str s14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr s31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr s17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr s14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr s26, [x22, x27]\n"
+ "str s0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr s13, [%[wbptr]]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "add x27, x27, #4\n"
+ "str s1, [x25, x28]\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "ldr s12, [%[wbptr], #4]\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr s29, [x17, x27]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "ldr x17, [%[inptrs], 8]\n"
+ "str s2, [x24, x28]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "ldr s9, [%[wbptr], #16]\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "ldr s28, [x17, x27]\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "ldr s11, [%[wbptr], #8]\n"
+ "str s3, [x23, x28]\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 56]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 16]\n"
+ "str s16, [x26, x28]\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "ldr s16, [x18, x27]\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "ldr s6, [%[wbptr], #28]\n"
+ "str s15, [x25, x28]\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "ldr s15, [x17, x27]\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "ldr s8, [%[wbptr], #20]\n"
+ "str s18, [x24, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "mov v22.16b, v13.16b\n"
+ "ldr x20, [%[inptrs], 144]\n"
+ "str s21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "mov v23.16b, v13.16b\n"
+ "ldr s10, [%[wbptr], #12]\n"
+ "str s24, [x25, x28]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v17.16b, v13.16b\n"
+ "ldr s21, [x20, x27]\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "ldr s5, [%[wbptr], #32]\n"
+ "mov v14.16b, v13.16b\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "mov v0.16b, v13.16b\n"
+ "ldr x19, [%[inptrs], 104]\n"
+ "mov v1.16b, v13.16b\n"
+ "mov v2.16b, v13.16b\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "ldr s7, [%[wbptr], #24]\n"
+ "fmla v18.4s, v29.4s, v12.4s\n"
+ "ldr s29, [x19, x27]\n"
+ "str s20, [x26, x28]\n"
+ "ldr s4, [%[wbptr], #36]\n"
+ "add x28, x28, #4\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v3.16b, v13.16b\n"
+ "ldr x18, [%[inptrs], 64]\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "ldr x17, [%[inptrs], 24]\n"
+ "fmla v22.4s, v27.4s, v12.4s\n"
+ "ldr s30, [x18, x27]\n"
+ "fmla v23.4s, v28.4s, v12.4s\n"
+ "ldr x21, [%[inptrs], 192]\n"
+ "fmla v19.4s, v25.4s, v12.4s\n"
+ "ldr x20, [%[inptrs], 152]\n"
+ "fmla v18.4s, v28.4s, v11.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v22.4s, v25.4s, v9.4s\n"
+ "ldr x19, [%[inptrs], 112]\n"
+ "fmla v23.4s, v16.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 72]\n"
+ "fmla v17.4s, v16.4s, v12.4s\n"
+ "ldr x17, [%[inptrs], 32]\n"
+ "fmla v18.4s, v25.4s, v6.4s\n"
+ "ldr s31, [x21, x27]\n"
+ "fmla v22.4s, v16.4s, v11.4s\n"
+ "ldr x22, [%[inptrs], 240]\n"
+ "fmla v23.4s, v15.4s, v11.4s\n"
+ "ldr x21, [%[inptrs], 200]\n"
+ "fmla v14.4s, v15.4s, v12.4s\n"
+ "ldr x23, [%[outptrs], 0]\n"
+ "fmla v18.4s, v16.4s, v8.4s\n"
+ "ldr s25, [x20, x27]\n"
+ "fmla v22.4s, v21.4s, v6.4s\n"
+ "ldr x20, [%[inptrs], 160]\n"
+ "fmla v19.4s, v21.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 32]\n"
+ "fmla v0.4s, v21.4s, v12.4s\n"
+ "ldr s21, [x19, x27]\n"
+ "fmla v18.4s, v15.4s, v10.4s\n"
+ "ldr s20, [x18, x27]\n"
+ "fmla v22.4s, v29.4s, v8.4s\n"
+ "ldr x19, [%[inptrs], 120]\n"
+ "fmla v23.4s, v29.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 80]\n"
+ "fmla v19.4s, v29.4s, v11.4s\n"
+ "ldr x25, [%[outptrs], 64]\n"
+ "fmla v18.4s, v29.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 96]\n"
+ "fmla v17.4s, v29.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v22.4s, v30.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v18.4s, v30.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 40]\n"
+ "fmla v23.4s, v30.4s, v8.4s\n"
+ "fmla v17.4s, v30.4s, v11.4s\n"
+ "fmla v14.4s, v30.4s, v9.4s\n"
+ "fmla v2.4s, v30.4s, v12.4s\n"
+ "mov v16.16b, v13.16b\n"
+ "fmla v3.4s, v24.4s, v12.4s\n"
+ "fmla v19.4s, v31.4s, v6.4s\n"
+ "fmla v0.4s, v31.4s, v9.4s\n"
+ "mov v15.16b, v13.16b\n"
+ "fmla v23.4s, v24.4s, v10.4s\n"
+ "fmla v14.4s, v24.4s, v11.4s\n"
+ "ldr s27, [x22, x27]\n"
+ "fmla v22.4s, v25.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 248]\n"
+ "fmla v19.4s, v25.4s, v8.4s\n"
+ "fmla v17.4s, v25.4s, v6.4s\n"
+ "fmla v0.4s, v25.4s, v11.4s\n"
+ "fmla v1.4s, v25.4s, v9.4s\n"
+ "fmla v16.4s, v25.4s, v12.4s\n"
+ "ldr s30, [x21, x27]\n"
+ "fmla v18.4s, v21.4s, v4.4s\n"
+ "ldr x21, [%[inptrs], 208]\n"
+ "fmla v22.4s, v21.4s, v7.4s\n"
+ "fmla v23.4s, v21.4s, v5.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v17.4s, v21.4s, v8.4s\n"
+ "fmla v14.4s, v21.4s, v6.4s\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "str s18, [x23, x28]\n"
+ "mov v18.16b, v13.16b\n"
+ "fmla v2.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 8]\n"
+ "fmla v15.4s, v21.4s, v12.4s\n"
+ "ldr s24, [x20, x27]\n"
+ "fmla v23.4s, v20.4s, v7.4s\n"
+ "ldr x20, [%[inptrs], 168]\n"
+ "fmla v17.4s, v20.4s, v10.4s\n"
+ "fmla v14.4s, v20.4s, v8.4s\n"
+ "fmla v2.4s, v20.4s, v11.4s\n"
+ "fmla v3.4s, v20.4s, v9.4s\n"
+ "fmla v18.4s, v20.4s, v12.4s\n"
+ "ldr s25, [x19, x27]\n"
+ "fmla v0.4s, v27.4s, v6.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v14.4s, v26.4s, v10.4s\n"
+ "ldr x19, [%[inptrs], 128]\n"
+ "fmla v3.4s, v26.4s, v11.4s\n"
+ "ldr s27, [x17, x27]\n"
+ "fmla v19.4s, v30.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 88]\n"
+ "fmla v0.4s, v30.4s, v8.4s\n"
+ "fmla v1.4s, v30.4s, v6.4s\n"
+ "fmla v16.4s, v30.4s, v9.4s\n"
+ "ldr s28, [x22, x27]\n"
+ "fmla v22.4s, v24.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 256]\n"
+ "fmla v19.4s, v24.4s, v7.4s\n"
+ "fmla v17.4s, v24.4s, v5.4s\n"
+ "fmla v0.4s, v24.4s, v10.4s\n"
+ "fmla v1.4s, v24.4s, v8.4s\n"
+ "fmla v2.4s, v24.4s, v6.4s\n"
+ "fmla v16.4s, v24.4s, v11.4s\n"
+ "str s22, [x24, x28]\n"
+ "mov v21.16b, v13.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr x24, [%[outptrs], 40]\n"
+ "fmla v23.4s, v25.4s, v4.4s\n"
+ "fmla v17.4s, v25.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v12.4s\n"
+ "ldr s22, [x21, x27]\n"
+ "fmla v14.4s, v25.4s, v5.4s\n"
+ "ldr x21, [%[inptrs], 216]\n"
+ "fmla v1.4s, v25.4s, v10.4s\n"
+ "fmla v2.4s, v25.4s, v8.4s\n"
+ "str s23, [x23, x28]\n"
+ "mov v24.16b, v13.16b\n"
+ "mov v20.16b, v13.16b\n"
+ "ldr x23, [%[outptrs], 16]\n"
+ "fmla v3.4s, v25.4s, v6.4s\n"
+ "fmla v15.4s, v25.4s, v11.4s\n"
+ "fmla v18.4s, v25.4s, v9.4s\n"
+ "fmla v24.4s, v25.4s, v12.4s\n"
+ "fmla v14.4s, v29.4s, v7.4s\n"
+ "ldr s30, [x20, x27]\n"
+ "fmla v2.4s, v29.4s, v10.4s\n"
+ "ldr x20, [%[inptrs], 176]\n"
+ "fmla v3.4s, v29.4s, v8.4s\n"
+ "fmla v0.4s, v28.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v11.4s\n"
+ "ldr s31, [x19, x27]\n"
+ "fmla v16.4s, v28.4s, v6.4s\n"
+ "ldr s26, [x18, x27]\n"
+ "fmla v19.4s, v22.4s, v4.4s\n"
+ "ldr x19, [%[inptrs], 136]\n"
+ "fmla v3.4s, v27.4s, v10.4s\n"
+ "ldr s23, [x22, x27]\n"
+ "fmla v0.4s, v22.4s, v7.4s\n"
+ "ldr x22, [%[inptrs], 264]\n"
+ "fmla v1.4s, v22.4s, v5.4s\n"
+ "fmla v16.4s, v22.4s, v8.4s\n"
+ "str s19, [x25, x28]\n"
+ "fmla v15.4s, v22.4s, v6.4s\n"
+ "fmla v21.4s, v22.4s, v9.4s\n"
+ "ldr s27, [x21, x27]\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "ldr s28, [x20, x27]\n"
+ "fmla v1.4s, v30.4s, v7.4s\n"
+ "ldr x21, [%[inptrs], 224]\n"
+ "fmla v2.4s, v30.4s, v5.4s\n"
+ "ldr x20, [%[inptrs], 184]\n"
+ "fmla v16.4s, v30.4s, v10.4s\n"
+ "ldr x25, [%[outptrs], 72]\n"
+ "str s17, [x24, x28]\n"
+ "fmla v15.4s, v30.4s, v8.4s\n"
+ "fmla v18.4s, v30.4s, v6.4s\n"
+ "ldr s22, [x19, x27]\n"
+ "fmla v21.4s, v30.4s, v11.4s\n"
+ "ldr x24, [%[outptrs], 48]\n"
+ "fmla v24.4s, v30.4s, v9.4s\n"
+ "fmla v20.4s, v30.4s, v12.4s\n"
+ "fmla v14.4s, v31.4s, v4.4s\n"
+ "ldr s30, [x22, x27]\n"
+ "fmla v2.4s, v31.4s, v7.4s\n"
+ "ldr s19, [x21, x27]\n"
+ "fmla v3.4s, v31.4s, v5.4s\n"
+ "ldr x22, [%[inptrs], 272]\n"
+ "fmla v15.4s, v31.4s, v10.4s\n"
+ "ldr x21, [%[inptrs], 232]\n"
+ "str s14, [x23, x28]\n"
+ "fmla v18.4s, v31.4s, v8.4s\n"
+ "fmla v24.4s, v31.4s, v11.4s\n"
+ "ldr s31, [x20, x27]\n"
+ "fmla v3.4s, v26.4s, v7.4s\n"
+ "ldr s17, [x22, x27]\n"
+ "fmla v0.4s, v23.4s, v4.4s\n"
+ "ldr x22, [%[inptrs], 280]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr s14, [x21, x27]\n"
+ "fmla v16.4s, v23.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 24]\n"
+ "fmla v21.4s, v23.4s, v6.4s\n"
+ "ldr s26, [x22, x27]\n"
+ "str s0, [x26, x28]\n"
+ "fmla v1.4s, v27.4s, v4.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 104]\n"
+ "fmla v16.4s, v27.4s, v7.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v21.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v6.4s\n"
+ "str s1, [x25, x28]\n"
+ "fmla v20.4s, v27.4s, v9.4s\n"
+ "fmla v2.4s, v28.4s, v4.4s\n"
+ "ldr x25, [%[outptrs], 80]\n"
+ "fmla v15.4s, v28.4s, v7.4s\n"
+ "fmla v18.4s, v28.4s, v5.4s\n"
+ "fmla v21.4s, v28.4s, v10.4s\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "fmla v20.4s, v28.4s, v11.4s\n"
+ "fmla v3.4s, v22.4s, v4.4s\n"
+ "str s2, [x24, x28]\n"
+ "fmla v16.4s, v30.4s, v4.4s\n"
+ "fmla v18.4s, v22.4s, v7.4s\n"
+ "ldr x24, [%[outptrs], 56]\n"
+ "fmla v24.4s, v22.4s, v10.4s\n"
+ "fmla v21.4s, v30.4s, v5.4s\n"
+ "str s3, [x23, x28]\n"
+ "fmla v20.4s, v30.4s, v6.4s\n"
+ "str s16, [x26, x28]\n"
+ "fmla v15.4s, v19.4s, v4.4s\n"
+ "fmla v18.4s, v31.4s, v4.4s\n"
+ "ldr x26, [%[outptrs], 112]\n"
+ "fmla v21.4s, v19.4s, v7.4s\n"
+ "fmla v24.4s, v19.4s, v5.4s\n"
+ "fmla v20.4s, v19.4s, v8.4s\n"
+ "str s15, [x25, x28]\n"
+ "str s18, [x24, x28]\n"
+ "ldr x25, [%[outptrs], 88]\n"
+ "fmla v24.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v17.4s, v4.4s\n"
+ "fmla v20.4s, v31.4s, v10.4s\n"
+ "str s21, [x26, x28]\n"
+ "fmla v20.4s, v17.4s, v5.4s\n"
+ "ldr x26, [%[outptrs], 120]\n"
+ "fmla v24.4s, v14.4s, v4.4s\n"
+ "fmla v20.4s, v14.4s, v7.4s\n"
+ "str s24, [x25, x28]\n"
+ "fmla v20.4s, v26.4s, v4.4s\n"
+ "str s20, [x26, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
- asm volatile (
- "qW22 .req q0\n" "vW22 .req v0\n"
- "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
- "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
- "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
- "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
- "qW21 .req q3\n" "vW21 .req v3\n"
- "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
- "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
- "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
- "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
- "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
- "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
- "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
- "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
- "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
- "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
- "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
- "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
- "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
- "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
- "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
- "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
- "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
- "qW33 .req q16\n" "vW33 .req v16\n"
- "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
- "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
- "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
- "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
- "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
- "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
- "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
- "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
- "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
- "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
- "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
- "qU23 .req q28\n" "qU52 .req q28\n"
- "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
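+ // ReLU specialisation of Conv::execute_tile. The preamble builds pointers to
+ // six input rows and four output rows, splits n_channels into blocks of four
+ // (q registers, labels 1-3) and a scalar tail (labels 4-6), and applies the
+ // activation by clamping each accumulator against zero (fmax) before storing.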
+ __asm __volatile(
+ "add x9, %[inptr0], %[input_row_stride]\n"
+ "add x28, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x16, %[outptr0], %[output_row_stride]\n"
+ "add x24, x9, %[input_row_stride]\n"
+ "add x25, x28, #64\n"
+ "add x23, x28, %[input_col_stride1]\n"
+ "add x26, x24, %[input_row_stride]\n"
+ "add x11, x23, #64\n"
+ "add x12, x23, %[input_col_stride1]\n"
+ "add x10, x26, %[input_row_stride]\n"
+ "add x13, x12, #64\n"
+ "add x14, x12, %[input_col_stride1]\n"
+ "add x27, x10, %[input_row_stride]\n"
+ "add x15, x14, #64\n"
+ "add x17, x16, %[output_row_stride]\n"
+ "add x18, x17, %[output_row_stride]\n"
+ "add x19, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x21, %[n_channels], #3\n"
+ "add x20, x19, %[output_col_stride1]\n"
+ "lsr x22, %[n_channels], #2\n"
+ "cbz x22, 4f\n"
+ "1:\n"
+ "ldr q21, [%[wbptr]]\n"
+ "subs x22, x22, #1\n"
+ "mov v7.16b, v21.16b\n"
+ "ldr q20, [%[wbptr], #16]\n"
+ "mov v3.16b, v21.16b\n"
+ "ldr q14, [%[wbptr], #32]\n"
+ "mov v6.16b, v21.16b\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "mov v15.16b, v21.16b\n"
+ "ldr q17, [%[wbptr], #64]\n"
+ "mov v2.16b, v21.16b\n"
+ "ldr q12, [%[wbptr], #80]\n"
+ "mov v5.16b, v21.16b\n"
+ "ldr q11, [%[wbptr], #96]\n"
+ "mov v0.16b, v21.16b\n"
+ "ldr q10, [%[wbptr], #112]\n"
+ "mov v16.16b, v21.16b\n"
+ "ldr q9, [%[wbptr], #128]\n"
+ "mov v1.16b, v21.16b\n"
+ "ldr q8, [%[wbptr], #144]\n"
+ "mov v4.16b, v21.16b\n"
+ "ldr q22, [%[inptr0]]\n"
+ "fmla v7.4s, v22.4s, v20.4s\n"
+ "ldr q19, [x9]\n"
+ "fmla v3.4s, v19.4s, v20.4s\n"
+ "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v6.4s, v23.4s, v20.4s\n"
+ "ldr q18, [x24]\n"
+ "fmla v7.4s, v19.4s, v17.4s\n"
+ "ldr q27, [x9, %[input_col_stride1]]\n"
+ "fmla v3.4s, v18.4s, v17.4s\n"
+ "ldr q28, [%[inptr0], x28]\n"
+ "fmla v15.4s, v18.4s, v20.4s\n"
+ "ldr q25, [x26]\n"
+ "fmla v7.4s, v23.4s, v14.4s\n"
+ "ldr q22, [x24, %[input_col_stride1]]\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x8]\n"
+ "fmla v7.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "prfm pldl1keep, [%[inptr0], x25]\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla v7.4s, v27.4s, v12.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v18.16b, v21.16b\n"
+ "ldr q23, [x9, x28]\n"
+ "mov v19.16b, v21.16b\n"
+ "prfm pldl1keep, [x9, x25]\n"
+ "fmla v6.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x11]\n"
+ "fmla v2.4s, v27.4s, v20.4s\n"
+ "ldr q24, [%[inptr0], x23]\n"
+ "fmla v7.4s, v28.4s, v13.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v6.4s, v28.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla v5.4s, v28.4s, v20.4s\n"
+ "ldr q26, [x10]\n"
+ "fmla v3.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x25]\n"
+ "fmla v15.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x9, x11]\n"
+ "fmla v0.4s, v25.4s, v20.4s\n"
+ "ldr q25, [x26, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x13]\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, #64]\n"
+ "fmla v6.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla v15.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x25]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "prfm pldl1keep, [x24, x11]\n"
+ "fmla v16.4s, v22.4s, v20.4s\n"
+ "ldr q22, [x24, x28]\n"
+ "fmla v7.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x13]\n"
+ "fmla v3.4s, v23.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v6.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "prfm pldl1keep, [x26, x11]\n"
+ "fmla v1.4s, v23.4s, v20.4s\n"
+ "ldr q23, [x9, x23]\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x24, x13]\n"
+ "fmla v5.4s, v24.4s, v14.4s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "fmla v4.4s, v24.4s, v20.4s\n"
+ "ldr q24, [%[inptr0], x12]\n"
+ "fmla v15.4s, v26.4s, v10.4s\n"
+ "prfm pldl1keep, [x27, x25]\n"
+ "fmla v0.4s, v26.4s, v17.4s\n"
+ "ldr q29, [x27]\n"
+ "fmla v3.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x11]\n"
+ "fmla v15.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x26, x13]\n"
+ "fmla v2.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "prfm pldl1keep, [x27, x11]\n"
+ "fmla v16.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x10, x13]\n"
+ "fmla v18.4s, v25.4s, v20.4s\n"
+ "ldr q26, [x10, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, x15]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x27, x13]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x27, x15]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v5.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v22.4s, v14.4s\n"
+ "subs x22, x22, #1\n"
+ "fmla v1.4s, v22.4s, v17.4s\n"
+ "fmla v19.4s, v22.4s, v20.4s\n"
+ "mov v22.16b, v21.16b\n"
+ "fmla v6.4s, v23.4s, v11.4s\n"
+ "fmla v2.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v23.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v14.4s\n"
+ "fmla v4.4s, v23.4s, v17.4s\n"
+ "fmla v22.4s, v23.4s, v20.4s\n"
+ "ldr q27, [x26, x28]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "fmla v0.4s, v29.4s, v10.4s\n"
+ "mov v23.16b, v21.16b\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "mov v25.16b, v21.16b\n"
+ "mov v24.16b, v21.16b\n"
+ "fmla v15.4s, v26.4s, v9.4s\n"
+ "fmla v0.4s, v26.4s, v12.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v18.4s, v26.4s, v17.4s\n"
+ "fmla v3.4s, v27.4s, v8.4s\n"
+ "ldr q29, [x24, x23]\n"
+ "fmla v15.4s, v27.4s, v11.4s\n"
+ "fmla v2.4s, v27.4s, v9.4s\n"
+ "fmla v0.4s, v27.4s, v13.4s\n"
+ "fmla v16.4s, v27.4s, v12.4s\n"
+ "fmla v1.4s, v27.4s, v10.4s\n"
+ "fmla v18.4s, v27.4s, v14.4s\n"
+ "fmla v19.4s, v27.4s, v17.4s\n"
+ "fmla v23.4s, v27.4s, v20.4s\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "ldr q28, [x9, x12]\n"
+ "fmla v2.4s, v29.4s, v11.4s\n"
+ "fmla v5.4s, v29.4s, v9.4s\n"
+ "fmla v16.4s, v29.4s, v13.4s\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "fmla v4.4s, v29.4s, v10.4s\n"
+ "fmla v19.4s, v29.4s, v14.4s\n"
+ "fmla v22.4s, v29.4s, v17.4s\n"
+ "fmla v25.4s, v29.4s, v20.4s\n"
+ "fmla v5.4s, v28.4s, v11.4s\n"
+ "ldr q21, [%[inptr0], x14]\n"
+ "fmla v1.4s, v28.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v22.4s, v28.4s, v14.4s\n"
+ "ldr q26, [x27, %[input_col_stride1]]\n"
+ "fmla v0.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x8]\n"
+ "fmla v4.4s, v21.4s, v13.4s\n"
+ "ldr q21, [x10, x28]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr q29, [x26, x23]\n"
+ "fmla v15.4s, v21.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x25]\n"
+ "fmla v0.4s, v21.4s, v11.4s\n"
+ "fmla v16.4s, v21.4s, v9.4s\n"
+ "fmla v18.4s, v21.4s, v12.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v23.4s, v21.4s, v17.4s\n"
+ "ldr q21, [x24, x12]\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v16.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v9.4s\n"
+ "fmla v18.4s, v29.4s, v13.4s\n"
+ "fmla v19.4s, v29.4s, v12.4s\n"
+ "fmla v22.4s, v29.4s, v10.4s\n"
+ "fmla v23.4s, v29.4s, v14.4s\n"
+ "fmla v25.4s, v29.4s, v17.4s\n"
+ "fmla v24.4s, v29.4s, v20.4s\n"
+ "ldr q28, [x9, x14]\n"
+ "fmla v5.4s, v21.4s, v8.4s\n"
+ "ldr q27, [x27, x28]\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v4.4s, v21.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "fmla v19.4s, v21.4s, v13.4s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla v22.4s, v21.4s, v12.4s\n"
+ "fmla v25.4s, v21.4s, v14.4s\n"
+ "fmla v4.4s, v28.4s, v11.4s\n"
+ "ldr q20, [x10, x23]\n"
+ "fmla v0.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "fmla v22.4s, v28.4s, v13.4s\n"
+ "ldr q26, [x26, x12]\n"
+ "fmla v23.4s, v27.4s, v10.4s\n"
+ "ldr q21, [x24, x14]\n"
+ "fmla v16.4s, v20.4s, v8.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v18.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v19.4s, v20.4s, v9.4s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla v23.4s, v20.4s, v12.4s\n"
+ "fmla v25.4s, v20.4s, v10.4s\n"
+ "fmla v24.4s, v20.4s, v17.4s\n"
+ "ldr q28, [x27, x23]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr q20, [x10, x12]\n"
+ "fmla v19.4s, v26.4s, v11.4s\n"
+ "fmla v22.4s, v26.4s, v9.4s\n"
+ "fmla v23.4s, v26.4s, v13.4s\n"
+ "fmla v25.4s, v26.4s, v12.4s\n"
+ "fmla v24.4s, v26.4s, v14.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v4.4s, v21.4s, v8.4s\n"
+ "ldr q26, [x27, x12]\n"
+ "fmla v22.4s, v21.4s, v11.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v25.4s, v21.4s, v13.4s\n"
+ "ldr q27, [x10, x14]\n"
+ "fmla v18.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v23.4s, v28.4s, v9.4s\n"
+ "add x10, x10, #16\n"
+ "fmla v24.4s, v28.4s, v10.4s\n"
+ "ldr q28, [x27, x14]\n"
+ "fmla v19.4s, v20.4s, v8.4s\n"
+ "ldr q21, [%[wbptr]]\n"
+ "fmla v23.4s, v20.4s, v11.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
+ "fmla v24.4s, v20.4s, v12.4s\n"
+ "fmla v22.4s, v17.4s, v8.4s\n"
+ "ldr q20, [%[wbptr], #16]\n"
+ "fmla v23.4s, v26.4s, v8.4s\n"
+ "ldr q14, [%[wbptr], #32]\n"
+ "fmla v24.4s, v17.4s, v13.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v25.4s, v17.4s, v11.4s\n"
+ "ldr q17, [%[wbptr], #64]\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "ldr q13, [%[wbptr], #48]\n"
+ "str q7, [%[outptr0]]\n"
+ "fmla v25.4s, v27.4s, v8.4s\n"
+ "str q6, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "ldr q12, [%[wbptr], #80]\n"
+ "str q5, [%[outptr0], x19]\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "ldr q10, [%[wbptr], #112]\n"
+ "str q4, [%[outptr0], x20]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "str q3, [x16]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr q11, [%[wbptr], #96]\n"
+ "str q2, [x16, %[output_col_stride1]]\n"
+ "fmax v22.4s, v22.4s, v29.4s\n"
+ "str q1, [x16, x19]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str q22, [x16, x20]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "str q15, [x17]\n"
+ "fmax v19.4s, v19.4s, v29.4s\n"
+ "str q16, [x17, %[output_col_stride1]]\n"
+ "fmax v25.4s, v25.4s, v29.4s\n"
+ "str q19, [x17, x19]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str q25, [x17, x20]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "str q0, [x18]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str q18, [x18, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "str q23, [x18, x19]\n"
+ "mov v7.16b, v21.16b\n"
+ "str q24, [x18, x20]\n"
+ "mov v3.16b, v21.16b\n"
+ "mov v6.16b, v21.16b\n"
+ "ldr q9, [%[wbptr], #128]\n"
+ "mov v15.16b, v21.16b\n"
+ "ldr q8, [%[wbptr], #144]\n"
+ "mov v2.16b, v21.16b\n"
+ "ldr q22, [%[inptr0]]\n"
+ "mov v5.16b, v21.16b\n"
+ "ldr q19, [x9]\n"
+ "mov v0.16b, v21.16b\n"
+ "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
+ "mov v16.16b, v21.16b\n"
+ "ldr q18, [x24]\n"
+ "mov v1.16b, v21.16b\n"
+ "ldr q27, [x9, %[input_col_stride1]]\n"
+ "mov v4.16b, v21.16b\n"
+ "ldr q28, [%[inptr0], x28]\n"
+ "fmla v7.4s, v22.4s, v20.4s\n"
+ "ldr q25, [x26]\n"
+ "fmla v3.4s, v19.4s, v20.4s\n"
+ "ldr q22, [x24, %[input_col_stride1]]\n"
+ "fmla v6.4s, v23.4s, v20.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmla v7.4s, v19.4s, v17.4s\n"
+ "add x16, x16, #16\n"
+ "fmla v3.4s, v18.4s, v17.4s\n"
+ "add x17, x17, #16\n"
+ "fmla v15.4s, v18.4s, v20.4s\n"
+ "add x18, x18, #16\n"
+ "fmla v7.4s, v23.4s, v14.4s\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "fmla v7.4s, v18.4s, v10.4s\n"
+ "fmla v7.4s, v27.4s, v12.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v18.16b, v21.16b\n"
+ "ldr q23, [x9, x28]\n"
+ "mov v19.16b, v21.16b\n"
+ "prfm pldl1keep, [x9, x25]\n"
+ "fmla v6.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x11]\n"
+ "fmla v2.4s, v27.4s, v20.4s\n"
+ "ldr q24, [%[inptr0], x23]\n"
+ "fmla v7.4s, v28.4s, v13.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v6.4s, v28.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla v5.4s, v28.4s, v20.4s\n"
+ "ldr q26, [x10]\n"
+ "fmla v3.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x25]\n"
+ "fmla v15.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x9, x11]\n"
+ "fmla v0.4s, v25.4s, v20.4s\n"
+ "ldr q25, [x26, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x13]\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, #64]\n"
+ "fmla v6.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla v15.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x25]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "prfm pldl1keep, [x24, x11]\n"
+ "fmla v16.4s, v22.4s, v20.4s\n"
+ "ldr q22, [x24, x28]\n"
+ "fmla v7.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x13]\n"
+ "fmla v3.4s, v23.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v6.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "prfm pldl1keep, [x26, x11]\n"
+ "fmla v1.4s, v23.4s, v20.4s\n"
+ "ldr q23, [x9, x23]\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x24, x13]\n"
+ "fmla v5.4s, v24.4s, v14.4s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "fmla v4.4s, v24.4s, v20.4s\n"
+ "ldr q24, [%[inptr0], x12]\n"
+ "fmla v15.4s, v26.4s, v10.4s\n"
+ "prfm pldl1keep, [x27, x25]\n"
+ "fmla v0.4s, v26.4s, v17.4s\n"
+ "ldr q29, [x27]\n"
+ "fmla v3.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x11]\n"
+ "fmla v15.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x26, x13]\n"
+ "fmla v2.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "prfm pldl1keep, [x27, x11]\n"
+ "fmla v16.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x10, x13]\n"
+ "fmla v18.4s, v25.4s, v20.4s\n"
+ "ldr q26, [x10, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, x15]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x27, x13]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x27, x15]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v5.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v22.4s, v14.4s\n"
+ "fmla v1.4s, v22.4s, v17.4s\n"
+ "fmla v19.4s, v22.4s, v20.4s\n"
+ "ldr q27, [x26, x28]\n"
+ "fmla v6.4s, v23.4s, v11.4s\n"
+ "fmla v2.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v23.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v14.4s\n"
+ "fmla v4.4s, v23.4s, v17.4s\n"
+ "fmla v0.4s, v29.4s, v10.4s\n"
+ "mov v22.16b, v21.16b\n"
+ "fmla v15.4s, v26.4s, v9.4s\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v22.4s, v23.4s, v20.4s\n"
+ "ldr q29, [x24, x23]\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "ldr q28, [x9, x12]\n"
+ "fmla v0.4s, v26.4s, v12.4s\n"
+ "fmla v18.4s, v26.4s, v17.4s\n"
+ "mov v23.16b, v21.16b\n"
+ "fmla v3.4s, v27.4s, v8.4s\n"
+ "fmla v15.4s, v27.4s, v11.4s\n"
+ "fmla v2.4s, v27.4s, v9.4s\n"
+ "fmla v0.4s, v27.4s, v13.4s\n"
+ "fmla v16.4s, v27.4s, v12.4s\n"
+ "fmla v1.4s, v27.4s, v10.4s\n"
+ "fmla v18.4s, v27.4s, v14.4s\n"
+ "fmla v19.4s, v27.4s, v17.4s\n"
+ "fmla v23.4s, v27.4s, v20.4s\n"
+ "mov v25.16b, v21.16b\n"
+ "mov v24.16b, v21.16b\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "fmla v2.4s, v29.4s, v11.4s\n"
+ "fmla v5.4s, v29.4s, v9.4s\n"
+ "fmla v16.4s, v29.4s, v13.4s\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "fmla v4.4s, v29.4s, v10.4s\n"
+ "fmla v19.4s, v29.4s, v14.4s\n"
+ "fmla v22.4s, v29.4s, v17.4s\n"
+ "fmla v25.4s, v29.4s, v20.4s\n"
+ "ldr q21, [%[inptr0], x14]\n"
+ "fmla v5.4s, v28.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v1.4s, v28.4s, v13.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v22.4s, v28.4s, v14.4s\n"
+ "ldr q26, [x27, %[input_col_stride1]]\n"
+ "fmla v0.4s, v26.4s, v9.4s\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "fmla v4.4s, v21.4s, v13.4s\n"
+ "ldr q21, [x10, x28]\n"
+ "fmla v15.4s, v21.4s, v8.4s\n"
+ "ldr q29, [x26, x23]\n"
+ "fmla v0.4s, v21.4s, v11.4s\n"
+ "fmla v16.4s, v21.4s, v9.4s\n"
+ "fmla v18.4s, v21.4s, v12.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v23.4s, v21.4s, v17.4s\n"
+ "ldr q21, [x24, x12]\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v16.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v9.4s\n"
+ "fmla v18.4s, v29.4s, v13.4s\n"
+ "fmla v19.4s, v29.4s, v12.4s\n"
+ "fmla v22.4s, v29.4s, v10.4s\n"
+ "fmla v23.4s, v29.4s, v14.4s\n"
+ "fmla v25.4s, v29.4s, v17.4s\n"
+ "fmla v24.4s, v29.4s, v20.4s\n"
+ "ldr q28, [x9, x14]\n"
+ "fmla v5.4s, v21.4s, v8.4s\n"
+ "ldr q27, [x27, x28]\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v4.4s, v21.4s, v9.4s\n"
+ "fmla v19.4s, v21.4s, v13.4s\n"
+ "fmla v22.4s, v21.4s, v12.4s\n"
+ "fmla v25.4s, v21.4s, v14.4s\n"
+ "fmla v0.4s, v27.4s, v8.4s\n"
+ "ldr q20, [x10, x23]\n"
+ "fmla v4.4s, v28.4s, v11.4s\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "fmla v22.4s, v28.4s, v13.4s\n"
+ "ldr q26, [x26, x12]\n"
+ "fmla v23.4s, v27.4s, v10.4s\n"
+ "ldr q21, [x24, x14]\n"
+ "fmla v16.4s, v20.4s, v8.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v18.4s, v20.4s, v11.4s\n"
+ "fmla v19.4s, v20.4s, v9.4s\n"
+ "fmla v23.4s, v20.4s, v12.4s\n"
+ "fmla v25.4s, v20.4s, v10.4s\n"
+ "fmla v24.4s, v20.4s, v17.4s\n"
+ "ldr q28, [x27, x23]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr q20, [x10, x12]\n"
+ "fmla v19.4s, v26.4s, v11.4s\n"
+ "fmla v22.4s, v26.4s, v9.4s\n"
+ "fmla v23.4s, v26.4s, v13.4s\n"
+ "fmla v25.4s, v26.4s, v12.4s\n"
+ "fmla v24.4s, v26.4s, v14.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v4.4s, v21.4s, v8.4s\n"
+ "ldr q26, [x27, x12]\n"
+ "fmla v22.4s, v21.4s, v11.4s\n"
+ "add x26, x26, #16\n"
+ "fmla v25.4s, v21.4s, v13.4s\n"
+ "ldr q27, [x10, x14]\n"
+ "fmla v18.4s, v28.4s, v8.4s\n"
+ "add x10, x10, #16\n"
+ "fmla v23.4s, v28.4s, v9.4s\n"
+ "fmla v24.4s, v28.4s, v10.4s\n"
+ "fmla v19.4s, v20.4s, v8.4s\n"
+ "ldr q28, [x27, x14]\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v23.4s, v20.4s, v11.4s\n"
+ "fmla v24.4s, v20.4s, v12.4s\n"
+ "fmla v22.4s, v17.4s, v8.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v25.4s, v17.4s, v11.4s\n"
+ "fmla v24.4s, v17.4s, v13.4s\n"
+ "fmla v23.4s, v26.4s, v8.4s\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmla v25.4s, v27.4s, v8.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "str q7, [%[outptr0]]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "str q6, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "str q5, [%[outptr0], x19]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "str q4, [%[outptr0], x20]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "str q3, [x16]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str q2, [x16, %[output_col_stride1]]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "str q1, [x16, x19]\n"
+ "fmax v22.4s, v22.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q22, [x16, x20]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "str q15, [x17]\n"
+ "fmax v19.4s, v19.4s, v29.4s\n"
+ "str q16, [x17, %[output_col_stride1]]\n"
+ "fmax v25.4s, v25.4s, v29.4s\n"
+ "str q19, [x17, x19]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str q25, [x17, x20]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "str q0, [x18]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str q18, [x18, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "str q23, [x18, x19]\n"
+ "add x16, x16, #16\n"
+ "str q24, [x18, x20]\n"
+ "add x17, x17, #16\n"
+ "add x18, x18, #16\n"
+ "4:\n"
+ "cbz x21, 7f\n"
+ "ldr s21, [%[wbptr]]\n"
+ "mov v7.16b, v21.16b\n"
+ "ldr s20, [%[wbptr], #4]\n"
+ "mov v3.16b, v21.16b\n"
+ "ldr s14, [%[wbptr], #8]\n"
+ "mov v6.16b, v21.16b\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "mov v15.16b, v21.16b\n"
+ "ldr s17, [%[wbptr], #16]\n"
+ "mov v2.16b, v21.16b\n"
+ "ldr s12, [%[wbptr], #20]\n"
+ "mov v5.16b, v21.16b\n"
+ "ldr s11, [%[wbptr], #24]\n"
+ "mov v0.16b, v21.16b\n"
+ "ldr s10, [%[wbptr], #28]\n"
+ "mov v16.16b, v21.16b\n"
+ "ldr s9, [%[wbptr], #32]\n"
+ "mov v1.16b, v21.16b\n"
+ "ldr s8, [%[wbptr], #36]\n"
+ "mov v4.16b, v21.16b\n"
+ "ldr s22, [%[inptr0]]\n"
+ "fmla v7.4s, v22.4s, v20.4s\n"
+ "ldr s19, [x9]\n"
+ "fmla v3.4s, v19.4s, v20.4s\n"
+ "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v6.4s, v23.4s, v20.4s\n"
+ "ldr s18, [x24]\n"
+ "fmla v7.4s, v19.4s, v17.4s\n"
+ "ldr s27, [x9, %[input_col_stride1]]\n"
+ "fmla v3.4s, v18.4s, v17.4s\n"
+ "ldr s28, [%[inptr0], x28]\n"
+ "fmla v15.4s, v18.4s, v20.4s\n"
+ "ldr s25, [x26]\n"
+ "fmla v7.4s, v23.4s, v14.4s\n"
+ "ldr s22, [x24, %[input_col_stride1]]\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "subs x21, x21, #1\n"
+ "prfm pldl1keep, [%[inptr0], x8]\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v7.4s, v18.4s, v10.4s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "prfm pldl1keep, [%[inptr0], x25]\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla v7.4s, v27.4s, v12.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v18.16b, v21.16b\n"
+ "ldr s23, [x9, x28]\n"
+ "mov v19.16b, v21.16b\n"
+ "prfm pldl1keep, [x9, x25]\n"
+ "fmla v6.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x11]\n"
+ "fmla v2.4s, v27.4s, v20.4s\n"
+ "ldr s24, [%[inptr0], x23]\n"
+ "fmla v7.4s, v28.4s, v13.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v6.4s, v28.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla v5.4s, v28.4s, v20.4s\n"
+ "ldr s26, [x10]\n"
+ "fmla v3.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x25]\n"
+ "fmla v15.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x9, x11]\n"
+ "fmla v0.4s, v25.4s, v20.4s\n"
+ "ldr s25, [x26, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x13]\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, #64]\n"
+ "fmla v6.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla v15.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x25]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "prfm pldl1keep, [x24, x11]\n"
+ "fmla v16.4s, v22.4s, v20.4s\n"
+ "ldr s22, [x24, x28]\n"
+ "fmla v7.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x13]\n"
+ "fmla v3.4s, v23.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v6.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "prfm pldl1keep, [x26, x11]\n"
+ "fmla v1.4s, v23.4s, v20.4s\n"
+ "ldr s23, [x9, x23]\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x24, x13]\n"
+ "fmla v5.4s, v24.4s, v14.4s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "fmla v4.4s, v24.4s, v20.4s\n"
+ "ldr s24, [%[inptr0], x12]\n"
+ "fmla v15.4s, v26.4s, v10.4s\n"
+ "prfm pldl1keep, [x27, x25]\n"
+ "fmla v0.4s, v26.4s, v17.4s\n"
+ "ldr s29, [x27]\n"
+ "fmla v3.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x11]\n"
+ "fmla v15.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x26, x13]\n"
+ "fmla v2.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "prfm pldl1keep, [x27, x11]\n"
+ "fmla v16.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x10, x13]\n"
+ "fmla v18.4s, v25.4s, v20.4s\n"
+ "ldr s26, [x10, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, x15]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x27, x13]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x27, x15]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v5.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v22.4s, v14.4s\n"
+ "subs x21, x21, #1\n"
+ "fmla v1.4s, v22.4s, v17.4s\n"
+ "fmla v19.4s, v22.4s, v20.4s\n"
+ "mov v22.16b, v21.16b\n"
+ "fmla v6.4s, v23.4s, v11.4s\n"
+ "fmla v2.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v23.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v14.4s\n"
+ "fmla v4.4s, v23.4s, v17.4s\n"
+ "fmla v22.4s, v23.4s, v20.4s\n"
+ "ldr s27, [x26, x28]\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "fmla v0.4s, v29.4s, v10.4s\n"
+ "mov v23.16b, v21.16b\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "mov v25.16b, v21.16b\n"
+ "mov v24.16b, v21.16b\n"
+ "fmla v15.4s, v26.4s, v9.4s\n"
+ "fmla v0.4s, v26.4s, v12.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v18.4s, v26.4s, v17.4s\n"
+ "fmla v3.4s, v27.4s, v8.4s\n"
+ "ldr s29, [x24, x23]\n"
+ "fmla v15.4s, v27.4s, v11.4s\n"
+ "fmla v2.4s, v27.4s, v9.4s\n"
+ "fmla v0.4s, v27.4s, v13.4s\n"
+ "fmla v16.4s, v27.4s, v12.4s\n"
+ "fmla v1.4s, v27.4s, v10.4s\n"
+ "fmla v18.4s, v27.4s, v14.4s\n"
+ "fmla v19.4s, v27.4s, v17.4s\n"
+ "fmla v23.4s, v27.4s, v20.4s\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "ldr s28, [x9, x12]\n"
+ "fmla v2.4s, v29.4s, v11.4s\n"
+ "fmla v5.4s, v29.4s, v9.4s\n"
+ "fmla v16.4s, v29.4s, v13.4s\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "fmla v4.4s, v29.4s, v10.4s\n"
+ "fmla v19.4s, v29.4s, v14.4s\n"
+ "fmla v22.4s, v29.4s, v17.4s\n"
+ "fmla v25.4s, v29.4s, v20.4s\n"
+ "fmla v5.4s, v28.4s, v11.4s\n"
+ "ldr s21, [%[inptr0], x14]\n"
+ "fmla v1.4s, v28.4s, v13.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v22.4s, v28.4s, v14.4s\n"
+ "ldr s26, [x27, %[input_col_stride1]]\n"
+ "fmla v0.4s, v26.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x8]\n"
+ "fmla v4.4s, v21.4s, v13.4s\n"
+ "ldr s21, [x10, x28]\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "ldr s29, [x26, x23]\n"
+ "fmla v15.4s, v21.4s, v8.4s\n"
+ "prfm pldl1keep, [%[inptr0], x25]\n"
+ "fmla v0.4s, v21.4s, v11.4s\n"
+ "fmla v16.4s, v21.4s, v9.4s\n"
+ "fmla v18.4s, v21.4s, v12.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v23.4s, v21.4s, v17.4s\n"
+ "ldr s21, [x24, x12]\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v16.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v9.4s\n"
+ "fmla v18.4s, v29.4s, v13.4s\n"
+ "fmla v19.4s, v29.4s, v12.4s\n"
+ "fmla v22.4s, v29.4s, v10.4s\n"
+ "fmla v23.4s, v29.4s, v14.4s\n"
+ "fmla v25.4s, v29.4s, v17.4s\n"
+ "fmla v24.4s, v29.4s, v20.4s\n"
+ "ldr s28, [x9, x14]\n"
+ "fmla v5.4s, v21.4s, v8.4s\n"
+ "ldr s27, [x27, x28]\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v4.4s, v21.4s, v9.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "fmla v19.4s, v21.4s, v13.4s\n"
+ "prfm pldl1keep, [x9, x8]\n"
+ "fmla v22.4s, v21.4s, v12.4s\n"
+ "fmla v25.4s, v21.4s, v14.4s\n"
+ "fmla v4.4s, v28.4s, v11.4s\n"
+ "ldr s20, [x10, x23]\n"
+ "fmla v0.4s, v27.4s, v8.4s\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "fmla v22.4s, v28.4s, v13.4s\n"
+ "ldr s26, [x26, x12]\n"
+ "fmla v23.4s, v27.4s, v10.4s\n"
+ "ldr s21, [x24, x14]\n"
+ "fmla v16.4s, v20.4s, v8.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v18.4s, v20.4s, v11.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v19.4s, v20.4s, v9.4s\n"
+ "prfm pldl1keep, [x24, x8]\n"
+ "fmla v23.4s, v20.4s, v12.4s\n"
+ "fmla v25.4s, v20.4s, v10.4s\n"
+ "fmla v24.4s, v20.4s, v17.4s\n"
+ "ldr s28, [x27, x23]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr s20, [x10, x12]\n"
+ "fmla v19.4s, v26.4s, v11.4s\n"
+ "fmla v22.4s, v26.4s, v9.4s\n"
+ "fmla v23.4s, v26.4s, v13.4s\n"
+ "fmla v25.4s, v26.4s, v12.4s\n"
+ "fmla v24.4s, v26.4s, v14.4s\n"
+ "ldr s17, [x26, x14]\n"
+ "fmla v4.4s, v21.4s, v8.4s\n"
+ "ldr s26, [x27, x12]\n"
+ "fmla v22.4s, v21.4s, v11.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v25.4s, v21.4s, v13.4s\n"
+ "ldr s27, [x10, x14]\n"
+ "fmla v18.4s, v28.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, #64]\n"
+ "fmla v23.4s, v28.4s, v9.4s\n"
+ "add x10, x10, #4\n"
+ "fmla v24.4s, v28.4s, v10.4s\n"
+ "ldr s28, [x27, x14]\n"
+ "fmla v19.4s, v20.4s, v8.4s\n"
+ "ldr s21, [%[wbptr]]\n"
+ "fmla v23.4s, v20.4s, v11.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
+ "fmla v24.4s, v20.4s, v12.4s\n"
+ "fmla v22.4s, v17.4s, v8.4s\n"
+ "ldr s20, [%[wbptr], #4]\n"
+ "fmla v23.4s, v26.4s, v8.4s\n"
+ "ldr s14, [%[wbptr], #8]\n"
+ "fmla v24.4s, v17.4s, v13.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v25.4s, v17.4s, v11.4s\n"
+ "ldr s17, [%[wbptr], #16]\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "ldr s13, [%[wbptr], #12]\n"
+ "str s7, [%[outptr0]]\n"
+ "fmla v25.4s, v27.4s, v8.4s\n"
+ "str s6, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "ldr s12, [%[wbptr], #20]\n"
+ "str s5, [%[outptr0], x19]\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "ldr s10, [%[wbptr], #28]\n"
+ "str s4, [%[outptr0], x20]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "str s3, [x16]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr s11, [%[wbptr], #24]\n"
+ "str s2, [x16, %[output_col_stride1]]\n"
+ "fmax v22.4s, v22.4s, v29.4s\n"
+ "str s1, [x16, x19]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str s22, [x16, x20]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "str s15, [x17]\n"
+ "fmax v19.4s, v19.4s, v29.4s\n"
+ "str s16, [x17, %[output_col_stride1]]\n"
+ "fmax v25.4s, v25.4s, v29.4s\n"
+ "str s19, [x17, x19]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str s25, [x17, x20]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "str s0, [x18]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str s18, [x18, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "str s23, [x18, x19]\n"
+ "mov v7.16b, v21.16b\n"
+ "str s24, [x18, x20]\n"
+ "mov v3.16b, v21.16b\n"
+ "mov v6.16b, v21.16b\n"
+ "ldr s9, [%[wbptr], #32]\n"
+ "mov v15.16b, v21.16b\n"
+ "ldr s8, [%[wbptr], #36]\n"
+ "mov v2.16b, v21.16b\n"
+ "ldr s22, [%[inptr0]]\n"
+ "mov v5.16b, v21.16b\n"
+ "ldr s19, [x9]\n"
+ "mov v0.16b, v21.16b\n"
+ "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
+ "mov v16.16b, v21.16b\n"
+ "ldr s18, [x24]\n"
+ "mov v1.16b, v21.16b\n"
+ "ldr s27, [x9, %[input_col_stride1]]\n"
+ "mov v4.16b, v21.16b\n"
+ "ldr s28, [%[inptr0], x28]\n"
+ "fmla v7.4s, v22.4s, v20.4s\n"
+ "ldr s25, [x26]\n"
+ "fmla v3.4s, v19.4s, v20.4s\n"
+ "ldr s22, [x24, %[input_col_stride1]]\n"
+ "fmla v6.4s, v23.4s, v20.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmla v7.4s, v19.4s, v17.4s\n"
+ "add x16, x16, #4\n"
+ "fmla v3.4s, v18.4s, v17.4s\n"
+ "add x17, x17, #4\n"
+ "fmla v15.4s, v18.4s, v20.4s\n"
+ "add x18, x18, #4\n"
+ "fmla v7.4s, v23.4s, v14.4s\n"
+ "fmla v3.4s, v27.4s, v14.4s\n"
+ "fmla v7.4s, v18.4s, v10.4s\n"
+ "fmla v7.4s, v27.4s, v12.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v18.16b, v21.16b\n"
+ "ldr s23, [x9, x28]\n"
+ "mov v19.16b, v21.16b\n"
+ "prfm pldl1keep, [x9, x25]\n"
+ "fmla v6.4s, v27.4s, v17.4s\n"
+ "prfm pldl1keep, [%[inptr0], x11]\n"
+ "fmla v2.4s, v27.4s, v20.4s\n"
+ "ldr s24, [%[inptr0], x23]\n"
+ "fmla v7.4s, v28.4s, v13.4s\n"
+ "prfm pldl1keep, [x10, #64]\n"
+ "fmla v6.4s, v28.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x8]\n"
+ "fmla v5.4s, v28.4s, v20.4s\n"
+ "ldr s26, [x10]\n"
+ "fmla v3.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x25]\n"
+ "fmla v15.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x9, x11]\n"
+ "fmla v0.4s, v25.4s, v20.4s\n"
+ "ldr s25, [x26, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [%[inptr0], x13]\n"
+ "fmla v3.4s, v22.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, #64]\n"
+ "fmla v6.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [x10, x8]\n"
+ "fmla v15.4s, v22.4s, v14.4s\n"
+ "prfm pldl1keep, [x26, x25]\n"
+ "fmla v2.4s, v22.4s, v17.4s\n"
+ "prfm pldl1keep, [x24, x11]\n"
+ "fmla v16.4s, v22.4s, v20.4s\n"
+ "ldr s22, [x24, x28]\n"
+ "fmla v7.4s, v23.4s, v11.4s\n"
+ "prfm pldl1keep, [x9, x13]\n"
+ "fmla v3.4s, v23.4s, v13.4s\n"
+ "prfm pldl1keep, [%[inptr0], x15]\n"
+ "fmla v6.4s, v23.4s, v12.4s\n"
+ "prfm pldl1keep, [x27, x8]\n"
+ "fmla v2.4s, v23.4s, v14.4s\n"
+ "prfm pldl1keep, [x10, x25]\n"
+ "fmla v5.4s, v23.4s, v17.4s\n"
+ "prfm pldl1keep, [x26, x11]\n"
+ "fmla v1.4s, v23.4s, v20.4s\n"
+ "ldr s23, [x9, x23]\n"
+ "fmla v6.4s, v24.4s, v13.4s\n"
+ "prfm pldl1keep, [x24, x13]\n"
+ "fmla v5.4s, v24.4s, v14.4s\n"
+ "prfm pldl1keep, [x9, x15]\n"
+ "fmla v4.4s, v24.4s, v20.4s\n"
+ "ldr s24, [%[inptr0], x12]\n"
+ "fmla v15.4s, v26.4s, v10.4s\n"
+ "prfm pldl1keep, [x27, x25]\n"
+ "fmla v0.4s, v26.4s, v17.4s\n"
+ "ldr s29, [x27]\n"
+ "fmla v3.4s, v25.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x11]\n"
+ "fmla v15.4s, v25.4s, v12.4s\n"
+ "prfm pldl1keep, [x26, x13]\n"
+ "fmla v2.4s, v25.4s, v10.4s\n"
+ "prfm pldl1keep, [x24, x15]\n"
+ "fmla v0.4s, v25.4s, v14.4s\n"
+ "prfm pldl1keep, [x27, x11]\n"
+ "fmla v16.4s, v25.4s, v17.4s\n"
+ "prfm pldl1keep, [x10, x13]\n"
+ "fmla v18.4s, v25.4s, v20.4s\n"
+ "ldr s26, [x10, %[input_col_stride1]]\n"
+ "fmla v7.4s, v22.4s, v8.4s\n"
+ "prfm pldl1keep, [x26, x15]\n"
+ "fmla v3.4s, v22.4s, v11.4s\n"
+ "prfm pldl1keep, [x27, x13]\n"
+ "fmla v6.4s, v22.4s, v9.4s\n"
+ "prfm pldl1keep, [x10, x15]\n"
+ "fmla v15.4s, v22.4s, v13.4s\n"
+ "prfm pldl1keep, [x27, x15]\n"
+ "fmla v2.4s, v22.4s, v12.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v5.4s, v22.4s, v10.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v22.4s, v14.4s\n"
+ "fmla v1.4s, v22.4s, v17.4s\n"
+ "fmla v19.4s, v22.4s, v20.4s\n"
+ "ldr s27, [x26, x28]\n"
+ "fmla v6.4s, v23.4s, v11.4s\n"
+ "fmla v2.4s, v23.4s, v13.4s\n"
+ "fmla v5.4s, v23.4s, v12.4s\n"
+ "fmla v1.4s, v23.4s, v14.4s\n"
+ "fmla v4.4s, v23.4s, v17.4s\n"
+ "fmla v0.4s, v29.4s, v10.4s\n"
+ "mov v22.16b, v21.16b\n"
+ "fmla v15.4s, v26.4s, v9.4s\n"
+ "fmla v5.4s, v24.4s, v13.4s\n"
+ "fmla v16.4s, v26.4s, v10.4s\n"
+ "fmla v22.4s, v23.4s, v20.4s\n"
+ "ldr s29, [x24, x23]\n"
+ "fmla v4.4s, v24.4s, v14.4s\n"
+ "ldr s28, [x9, x12]\n"
+ "fmla v0.4s, v26.4s, v12.4s\n"
+ "fmla v18.4s, v26.4s, v17.4s\n"
+ "mov v23.16b, v21.16b\n"
+ "fmla v3.4s, v27.4s, v8.4s\n"
+ "fmla v15.4s, v27.4s, v11.4s\n"
+ "fmla v2.4s, v27.4s, v9.4s\n"
+ "fmla v0.4s, v27.4s, v13.4s\n"
+ "fmla v16.4s, v27.4s, v12.4s\n"
+ "fmla v1.4s, v27.4s, v10.4s\n"
+ "fmla v18.4s, v27.4s, v14.4s\n"
+ "fmla v19.4s, v27.4s, v17.4s\n"
+ "fmla v23.4s, v27.4s, v20.4s\n"
+ "mov v25.16b, v21.16b\n"
+ "mov v24.16b, v21.16b\n"
+ "fmla v6.4s, v29.4s, v8.4s\n"
+ "fmla v2.4s, v29.4s, v11.4s\n"
+ "fmla v5.4s, v29.4s, v9.4s\n"
+ "fmla v16.4s, v29.4s, v13.4s\n"
+ "fmla v1.4s, v29.4s, v12.4s\n"
+ "fmla v4.4s, v29.4s, v10.4s\n"
+ "fmla v19.4s, v29.4s, v14.4s\n"
+ "fmla v22.4s, v29.4s, v17.4s\n"
+ "fmla v25.4s, v29.4s, v20.4s\n"
+ "ldr s21, [%[inptr0], x14]\n"
+ "fmla v5.4s, v28.4s, v11.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v1.4s, v28.4s, v13.4s\n"
+ "fmla v4.4s, v28.4s, v12.4s\n"
+ "fmla v22.4s, v28.4s, v14.4s\n"
+ "ldr s26, [x27, %[input_col_stride1]]\n"
+ "fmla v0.4s, v26.4s, v9.4s\n"
+ "fmla v18.4s, v26.4s, v10.4s\n"
+ "fmla v4.4s, v21.4s, v13.4s\n"
+ "ldr s21, [x10, x28]\n"
+ "fmla v15.4s, v21.4s, v8.4s\n"
+ "ldr s29, [x26, x23]\n"
+ "fmla v0.4s, v21.4s, v11.4s\n"
+ "fmla v16.4s, v21.4s, v9.4s\n"
+ "fmla v18.4s, v21.4s, v12.4s\n"
+ "fmla v19.4s, v21.4s, v10.4s\n"
+ "fmla v23.4s, v21.4s, v17.4s\n"
+ "ldr s21, [x24, x12]\n"
+ "fmla v2.4s, v29.4s, v8.4s\n"
+ "fmla v16.4s, v29.4s, v11.4s\n"
+ "fmla v1.4s, v29.4s, v9.4s\n"
+ "fmla v18.4s, v29.4s, v13.4s\n"
+ "fmla v19.4s, v29.4s, v12.4s\n"
+ "fmla v22.4s, v29.4s, v10.4s\n"
+ "fmla v23.4s, v29.4s, v14.4s\n"
+ "fmla v25.4s, v29.4s, v17.4s\n"
+ "fmla v24.4s, v29.4s, v20.4s\n"
+ "ldr s28, [x9, x14]\n"
+ "fmla v5.4s, v21.4s, v8.4s\n"
+ "ldr s27, [x27, x28]\n"
+ "fmla v1.4s, v21.4s, v11.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v4.4s, v21.4s, v9.4s\n"
+ "fmla v19.4s, v21.4s, v13.4s\n"
+ "fmla v22.4s, v21.4s, v12.4s\n"
+ "fmla v25.4s, v21.4s, v14.4s\n"
+ "fmla v0.4s, v27.4s, v8.4s\n"
+ "ldr s20, [x10, x23]\n"
+ "fmla v4.4s, v28.4s, v11.4s\n"
+ "fmla v18.4s, v27.4s, v9.4s\n"
+ "fmla v22.4s, v28.4s, v13.4s\n"
+ "ldr s26, [x26, x12]\n"
+ "fmla v23.4s, v27.4s, v10.4s\n"
+ "ldr s21, [x24, x14]\n"
+ "fmla v16.4s, v20.4s, v8.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v18.4s, v20.4s, v11.4s\n"
+ "fmla v19.4s, v20.4s, v9.4s\n"
+ "fmla v23.4s, v20.4s, v12.4s\n"
+ "fmla v25.4s, v20.4s, v10.4s\n"
+ "fmla v24.4s, v20.4s, v17.4s\n"
+ "ldr s28, [x27, x23]\n"
+ "fmla v1.4s, v26.4s, v8.4s\n"
+ "ldr s20, [x10, x12]\n"
+ "fmla v19.4s, v26.4s, v11.4s\n"
+ "fmla v22.4s, v26.4s, v9.4s\n"
+ "fmla v23.4s, v26.4s, v13.4s\n"
+ "fmla v25.4s, v26.4s, v12.4s\n"
+ "fmla v24.4s, v26.4s, v14.4s\n"
+ "ldr s17, [x26, x14]\n"
+ "fmla v4.4s, v21.4s, v8.4s\n"
+ "ldr s26, [x27, x12]\n"
+ "fmla v22.4s, v21.4s, v11.4s\n"
+ "add x26, x26, #4\n"
+ "fmla v25.4s, v21.4s, v13.4s\n"
+ "ldr s27, [x10, x14]\n"
+ "fmla v18.4s, v28.4s, v8.4s\n"
+ "add x10, x10, #4\n"
+ "fmla v23.4s, v28.4s, v9.4s\n"
+ "fmla v24.4s, v28.4s, v10.4s\n"
+ "fmla v19.4s, v20.4s, v8.4s\n"
+ "ldr s28, [x27, x14]\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v23.4s, v20.4s, v11.4s\n"
+ "fmla v24.4s, v20.4s, v12.4s\n"
+ "fmla v22.4s, v17.4s, v8.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v25.4s, v17.4s, v11.4s\n"
+ "fmla v24.4s, v17.4s, v13.4s\n"
+ "fmla v23.4s, v26.4s, v8.4s\n"
+ "fmax v7.4s, v7.4s, v29.4s\n"
+ "fmla v25.4s, v27.4s, v8.4s\n"
+ "fmax v6.4s, v6.4s, v29.4s\n"
+ "str s7, [%[outptr0]]\n"
+ "fmla v24.4s, v26.4s, v9.4s\n"
+ "str s6, [%[outptr0], %[output_col_stride1]]\n"
+ "fmax v5.4s, v5.4s, v29.4s\n"
+ "fmax v4.4s, v4.4s, v29.4s\n"
+ "fmax v3.4s, v3.4s, v29.4s\n"
+ "str s5, [%[outptr0], x19]\n"
+ "fmla v24.4s, v27.4s, v11.4s\n"
+ "str s4, [%[outptr0], x20]\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "str s3, [x16]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str s2, [x16, %[output_col_stride1]]\n"
+ "fmla v24.4s, v28.4s, v8.4s\n"
+ "str s1, [x16, x19]\n"
+ "fmax v22.4s, v22.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "str s22, [x16, x20]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "str s15, [x17]\n"
+ "fmax v19.4s, v19.4s, v29.4s\n"
+ "str s16, [x17, %[output_col_stride1]]\n"
+ "fmax v25.4s, v25.4s, v29.4s\n"
+ "str s19, [x17, x19]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str s25, [x17, x20]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "str s0, [x18]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str s18, [x18, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "str s23, [x18, x19]\n"
+ "add x16, x16, #4\n"
+ "str s24, [x18, x20]\n"
+ "add x17, x17, #4\n"
+ "add x18, x18, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+ );
+}
- "uptr1 .req x0\n"
- "uptr2 .req x1\n"
- "uptr3 .req x2\n"
- "uptr4 .req x3\n"
- "uptr5 .req x4\n"
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *inptrs[6][6],
+ float *outptrs[4][4]
+)
+{
+ __asm __volatile(
+ "mov x27, xzr\n"
+ "mov x28, xzr\n"
+ "and x19, %[n_channels], #3\n"
+ "lsr x26, %[n_channels], #2\n"
+ "cbz x26, 4f\n"
+ "1:\n"
+ "ldr q25, [%[wbptr]]\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr q22, [%[wbptr], #16]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr q9, [%[wbptr], #32]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldr q8, [%[wbptr], #48]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "mov v0.16b, v25.16b\n"
+ "ldr q7, [%[wbptr], #80]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr q5, [%[wbptr], #112]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr q4, [%[wbptr], #128]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "ldr q27, [x25, x27]\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "ldr q26, [x17, x27]\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr q31, [x25, x27]\n"
+ "ldr q28, [x24, x27]\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "subs x26, x26, #1\n"
+ "ldr q30, [x25, x27]\n"
+ "ldr q27, [x18, x27]\n"
+ "ldr q21, [x24, x27]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr q23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr q26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr q20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "subs x26, x26, #1\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr q23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr q30, [x24, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "ldr q31, [x17, x27]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr q25, [x25, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr q26, [x16, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "ldr q25, [x17, x27]\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr q29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "ldr q22, [x15, x27]\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "ldr q26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr q25, [x16, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr q22, [x18, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr q19, [x16, x27]\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "ldr q27, [x25, x27]\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "ldr q26, [x17, x27]\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "ldr q25, [%[wbptr]]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "ldr q22, [%[wbptr], #16]\n"
+ "str q2, [x20, x28]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "ldr q9, [%[wbptr], #32]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "ldr q8, [%[wbptr], #48]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str q18, [x20, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str q16, [x21, x28]\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr q7, [%[wbptr], #80]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "ldr q5, [%[wbptr], #112]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "ldr q6, [%[wbptr], #96]\n"
+ "str q13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr q4, [%[wbptr], #128]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr q31, [x25, x27]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "ldr q3, [%[wbptr], #144]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "str q14, [x23, x28]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr q29, [x17, x27]\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "str q17, [x20, x28]\n"
+ "mov v16.16b, v25.16b\n"
+ "str q0, [x21, x28]\n"
+ "mov v18.16b, v25.16b\n"
+ "str q12, [x22, x28]\n"
+ "mov v13.16b, v25.16b\n"
+ "str q10, [x23, x28]\n"
+ "mov v0.16b, v25.16b\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr q30, [x25, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str q1, [x20, x28]\n"
+ "mov v14.16b, v25.16b\n"
+ "str q15, [x21, x28]\n"
+ "mov v12.16b, v25.16b\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "str q21, [x21, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr q21, [x24, x27]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str q11, [x22, x28]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str q24, [x22, x28]\n"
+ "str q23, [x23, x28]\n"
+ "add x28, x28, #16\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr q23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr q26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr q20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr q24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr q23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr q26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "ldr q30, [x24, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr q31, [x17, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "ldr q25, [x25, x27]\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr q26, [x16, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr q29, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr q28, [x24, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr q25, [x17, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr q29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr q22, [x15, x27]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr q27, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr q26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr q25, [x16, x27]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr q31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr q22, [x18, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "ldr q19, [x16, x27]\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr q28, [x15, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr q30, [x16, x27]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "add x27, x27, #16\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str q2, [x20, x28]\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str q18, [x20, x28]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str q16, [x21, x28]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "str q17, [x20, x28]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "str q0, [x21, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str q13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str q1, [x20, x28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "str q15, [x21, x28]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "str q14, [x23, x28]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str q21, [x21, x28]\n"
+ "str q12, [x22, x28]\n"
+ "str q10, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str q11, [x22, x28]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str q24, [x22, x28]\n"
+ "str q23, [x23, x28]\n"
+ "add x28, x28, #16\n"
+ "4:\n"
+ "cbz x19, 7f\n"
+ "ldr s25, [%[wbptr]]\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr s22, [%[wbptr], #4]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr s9, [%[wbptr], #8]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldr s8, [%[wbptr], #12]\n"
+ "mov v13.16b, v25.16b\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "mov v0.16b, v25.16b\n"
+ "ldr s7, [%[wbptr], #20]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "mov v14.16b, v25.16b\n"
+ "ldr s5, [%[wbptr], #28]\n"
+ "mov v12.16b, v25.16b\n"
+ "ldr s4, [%[wbptr], #32]\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "subs x19, x19, #1\n"
+ "ldr s27, [x25, x27]\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "ldr s27, [x18, x27]\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "ldr s31, [x25, x27]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr s29, [x17, x27]\n"
+ "ldr s21, [x24, x27]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr s30, [x25, x27]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr s23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr s26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr s20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "subs x19, x19, #1\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr s23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v11.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr s30, [x24, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "ldr s31, [x17, x27]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr s25, [x25, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 0]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr s26, [x16, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "ldr s25, [x17, x27]\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr s29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x17, [%[inptrs], 48]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "ldr s22, [x15, x27]\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "ldr s26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 96]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr s25, [x16, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr s22, [x18, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr s19, [x16, x27]\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr x18, [%[inptrs], 144]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "ldr s27, [x25, x27]\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "ldr x25, [%[inptrs], 8]\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "ldr s26, [x17, x27]\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 56]\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "ldr s25, [%[wbptr]]\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "ldr s22, [%[wbptr], #4]\n"
+ "str s2, [x20, x28]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "ldr s9, [%[wbptr], #8]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "ldr s8, [%[wbptr], #12]\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str s18, [x20, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str s16, [x21, x28]\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr s7, [%[wbptr], #20]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "ldr s5, [%[wbptr], #28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "ldr s6, [%[wbptr], #24]\n"
+ "str s13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr s4, [%[wbptr], #32]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr s31, [x25, x27]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "ldr s3, [%[wbptr], #36]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "str s14, [x23, x28]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "mov v2.16b, v25.16b\n"
+ "ldr s29, [x17, x27]\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "ldr x25, [%[inptrs], 16]\n"
+ "ldr x24, [%[inptrs], 104]\n"
+ "str s17, [x20, x28]\n"
+ "mov v16.16b, v25.16b\n"
+ "str s0, [x21, x28]\n"
+ "mov v18.16b, v25.16b\n"
+ "str s12, [x22, x28]\n"
+ "mov v13.16b, v25.16b\n"
+ "str s10, [x23, x28]\n"
+ "mov v0.16b, v25.16b\n"
+ "fmla v2.4s, v27.4s, v22.4s\n"
+ "ldr s30, [x25, x27]\n"
+ "fmla v16.4s, v26.4s, v22.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str s1, [x20, x28]\n"
+ "mov v14.16b, v25.16b\n"
+ "str s15, [x21, x28]\n"
+ "mov v12.16b, v25.16b\n"
+ "mov v15.16b, v25.16b\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "fmla v2.4s, v26.4s, v19.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "str s21, [x21, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr s21, [x24, x27]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str s11, [x22, x28]\n"
+ "fmla v2.4s, v31.4s, v9.4s\n"
+ "str s20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str s24, [x22, x28]\n"
+ "str s23, [x23, x28]\n"
+ "add x28, x28, #4\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v1.16b, v25.16b\n"
+ "ldr x17, [%[inptrs], 64]\n"
+ "mov v10.16b, v25.16b\n"
+ "ldr x25, [%[inptrs], 24]\n"
+ "mov v11.16b, v25.16b\n"
+ "ldr x15, [%[inptrs], 192]\n"
+ "fmla v18.4s, v31.4s, v22.4s\n"
+ "ldr s23, [x17, x27]\n"
+ "fmla v2.4s, v28.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 152]\n"
+ "fmla v16.4s, v28.4s, v19.4s\n"
+ "ldr x24, [%[inptrs], 112]\n"
+ "fmla v13.4s, v28.4s, v22.4s\n"
+ "ldr s26, [x25, x27]\n"
+ "fmla v18.4s, v29.4s, v19.4s\n"
+ "ldr x17, [%[inptrs], 72]\n"
+ "fmla v2.4s, v29.4s, v7.4s\n"
+ "ldr x25, [%[inptrs], 32]\n"
+ "fmla v16.4s, v29.4s, v9.4s\n"
+ "ldr x16, [%[inptrs], 240]\n"
+ "fmla v0.4s, v29.4s, v22.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v18.4s, v30.4s, v9.4s\n"
+ "ldr x15, [%[inptrs], 200]\n"
+ "fmla v2.4s, v30.4s, v8.4s\n"
+ "ldr x20, [%[outptrs], 0]\n"
+ "fmla v17.4s, v30.4s, v22.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v16.4s, v27.4s, v5.4s\n"
+ "ldr x18, [%[inptrs], 160]\n"
+ "fmla v13.4s, v27.4s, v19.4s\n"
+ "ldr x21, [%[outptrs], 32]\n"
+ "fmla v14.4s, v27.4s, v22.4s\n"
+ "ldr s20, [x24, x27]\n"
+ "fmla v2.4s, v21.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 120]\n"
+ "fmla v16.4s, v21.4s, v7.4s\n"
+ "ldr x22, [%[outptrs], 64]\n"
+ "fmla v18.4s, v21.4s, v5.4s\n"
+ "ldr x23, [%[outptrs], 96]\n"
+ "fmla v13.4s, v21.4s, v9.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v0.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v12.4s, v21.4s, v22.4s\n"
+ "ldr s24, [x17, x27]\n"
+ "fmla v2.4s, v23.4s, v6.4s\n"
+ "ldr x17, [%[inptrs], 80]\n"
+ "fmla v16.4s, v23.4s, v8.4s\n"
+ "fmla v18.4s, v23.4s, v7.4s\n"
+ "fmla v0.4s, v23.4s, v9.4s\n"
+ "fmla v17.4s, v23.4s, v19.4s\n"
+ "fmla v15.4s, v23.4s, v22.4s\n"
+ "ldr s23, [x25, x27]\n"
+ "fmla v1.4s, v26.4s, v22.4s\n"
+ "ldr x25, [%[inptrs], 40]\n"
+ "fmla v18.4s, v26.4s, v8.4s\n"
+ "fmla v13.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v26.4s, v9.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v14.4s, v28.4s, v19.4s\n"
+ "ldr s26, [x15, x27]\n"
+ "fmla v16.4s, v29.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 248]\n"
+ "fmla v13.4s, v29.4s, v7.4s\n"
+ "ldr x15, [%[inptrs], 208]\n"
+ "fmla v0.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v19.4s\n"
+ "fmla v14.4s, v29.4s, v9.4s\n"
+ "fmla v10.4s, v29.4s, v22.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v2.4s, v20.4s, v3.4s\n"
+ "fmla v16.4s, v20.4s, v6.4s\n"
+ "fmla v18.4s, v20.4s, v4.4s\n"
+ "fmla v13.4s, v20.4s, v8.4s\n"
+ "fmla v0.4s, v20.4s, v7.4s\n"
+ "fmla v17.4s, v20.4s, v5.4s\n"
+ "fmla v12.4s, v20.4s, v9.4s\n"
+ "fmla v15.4s, v20.4s, v19.4s\n"
+ "fmla v11.4s, v20.4s, v22.4s\n"
+ "mov v20.16b, v25.16b\n"
+ "fmla v18.4s, v24.4s, v6.4s\n"
+ "fmla v0.4s, v24.4s, v8.4s\n"
+ "fmla v1.4s, v24.4s, v19.4s\n"
+ "fmla v17.4s, v24.4s, v7.4s\n"
+ "fmla v21.4s, v24.4s, v22.4s\n"
+ "fmla v15.4s, v24.4s, v9.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v14.4s, v30.4s, v5.4s\n"
+ "ldr s30, [x24, x27]\n"
+ "fmla v1.4s, v23.4s, v9.4s\n"
+ "ldr x18, [%[inptrs], 168]\n"
+ "fmla v17.4s, v23.4s, v8.4s\n"
+ "ldr s31, [x17, x27]\n"
+ "fmla v13.4s, v26.4s, v4.4s\n"
+ "ldr x24, [%[inptrs], 128]\n"
+ "fmla v14.4s, v26.4s, v7.4s\n"
+ "ldr x17, [%[inptrs], 88]\n"
+ "fmla v12.4s, v26.4s, v5.4s\n"
+ "fmla v10.4s, v26.4s, v19.4s\n"
+ "mov v24.16b, v25.16b\n"
+ "mov v23.16b, v25.16b\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v0.4s, v27.4s, v4.4s\n"
+ "fmla v14.4s, v27.4s, v8.4s\n"
+ "fmla v12.4s, v27.4s, v7.4s\n"
+ "fmla v15.4s, v27.4s, v5.4s\n"
+ "fmla v10.4s, v27.4s, v9.4s\n"
+ "fmla v11.4s, v27.4s, v19.4s\n"
+ "fmla v20.4s, v27.4s, v22.4s\n"
+ "ldr s25, [x25, x27]\n"
+ "fmla v18.4s, v30.4s, v3.4s\n"
+ "fmla v0.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v4.4s\n"
+ "fmla v12.4s, v30.4s, v8.4s\n"
+ "fmla v15.4s, v30.4s, v7.4s\n"
+ "fmla v1.4s, v30.4s, v5.4s\n"
+ "fmla v11.4s, v30.4s, v9.4s\n"
+ "fmla v21.4s, v30.4s, v19.4s\n"
+ "fmla v24.4s, v30.4s, v22.4s\n"
+ "ldr s26, [x16, x27]\n"
+ "fmla v17.4s, v31.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 256]\n"
+ "fmla v15.4s, v31.4s, v8.4s\n"
+ "fmla v1.4s, v31.4s, v7.4s\n"
+ "fmla v21.4s, v31.4s, v9.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 216]\n"
+ "fmla v10.4s, v26.4s, v5.4s\n"
+ "ldr s29, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v8.4s\n"
+ "ldr s28, [x24, x27]\n"
+ "fmla v13.4s, v31.4s, v3.4s\n"
+ "ldr x18, [%[inptrs], 176]\n"
+ "fmla v14.4s, v31.4s, v6.4s\n"
+ "ldr x24, [%[inptrs], 136]\n"
+ "fmla v12.4s, v31.4s, v4.4s\n"
+ "fmla v10.4s, v31.4s, v7.4s\n"
+ "fmla v11.4s, v31.4s, v5.4s\n"
+ "fmla v20.4s, v31.4s, v19.4s\n"
+ "fmla v0.4s, v29.4s, v3.4s\n"
+ "ldr s25, [x17, x27]\n"
+ "fmla v15.4s, v29.4s, v4.4s\n"
+ "fmla v21.4s, v29.4s, v5.4s\n"
+ "fmla v12.4s, v29.4s, v6.4s\n"
+ "fmla v10.4s, v29.4s, v8.4s\n"
+ "fmla v11.4s, v29.4s, v7.4s\n"
+ "fmla v20.4s, v29.4s, v9.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v23.4s, v29.4s, v22.4s\n"
+ "fmla v17.4s, v28.4s, v3.4s\n"
+ "ldr s29, [x16, x27]\n"
+ "fmla v15.4s, v28.4s, v6.4s\n"
+ "ldr s22, [x15, x27]\n"
+ "fmla v1.4s, v28.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 264]\n"
+ "fmla v11.4s, v28.4s, v8.4s\n"
+ "ldr x15, [%[inptrs], 224]\n"
+ "fmla v21.4s, v28.4s, v7.4s\n"
+ "fmla v24.4s, v28.4s, v9.4s\n"
+ "fmla v14.4s, v29.4s, v3.4s\n"
+ "ldr s27, [x18, x27]\n"
+ "fmla v1.4s, v25.4s, v6.4s\n"
+ "ldr x18, [%[inptrs], 184]\n"
+ "fmla v10.4s, v29.4s, v4.4s\n"
+ "fmla v20.4s, v29.4s, v5.4s\n"
+ "fmla v21.4s, v25.4s, v8.4s\n"
+ "ldr s26, [x24, x27]\n"
+ "fmla v12.4s, v22.4s, v3.4s\n"
+ "ldr s25, [x16, x27]\n"
+ "fmla v11.4s, v22.4s, v4.4s\n"
+ "ldr x16, [%[inptrs], 272]\n"
+ "fmla v10.4s, v22.4s, v6.4s\n"
+ "fmla v20.4s, v22.4s, v7.4s\n"
+ "fmla v24.4s, v22.4s, v5.4s\n"
+ "fmla v23.4s, v22.4s, v19.4s\n"
+ "fmla v15.4s, v27.4s, v3.4s\n"
+ "ldr s31, [x15, x27]\n"
+ "fmla v11.4s, v27.4s, v6.4s\n"
+ "ldr s22, [x18, x27]\n"
+ "fmla v21.4s, v27.4s, v4.4s\n"
+ "ldr x15, [%[inptrs], 232]\n"
+ "fmla v20.4s, v27.4s, v8.4s\n"
+ "fmla v24.4s, v27.4s, v7.4s\n"
+ "fmla v23.4s, v27.4s, v9.4s\n"
+ "ldr s19, [x16, x27]\n"
+ "fmla v1.4s, v26.4s, v3.4s\n"
+ "ldr s28, [x15, x27]\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr x16, [%[inptrs], 280]\n"
+ "fmla v24.4s, v26.4s, v8.4s\n"
+ "fmla v10.4s, v25.4s, v3.4s\n"
+ "fmla v20.4s, v25.4s, v4.4s\n"
+ "ldr s30, [x16, x27]\n"
+ "fmla v23.4s, v25.4s, v5.4s\n"
+ "add x27, x27, #4\n"
+ "fmla v11.4s, v31.4s, v3.4s\n"
+ "fmla v21.4s, v22.4s, v3.4s\n"
+ "fmla v24.4s, v31.4s, v4.4s\n"
+ "movi v29.16b, #0\n"
+ "fmla v20.4s, v31.4s, v6.4s\n"
+ "fmla v23.4s, v31.4s, v7.4s\n"
+ "fmax v2.4s, v2.4s, v29.4s\n"
+ "fmax v18.4s, v18.4s, v29.4s\n"
+ "fmla v24.4s, v22.4s, v6.4s\n"
+ "fmax v17.4s, v17.4s, v29.4s\n"
+ "fmla v20.4s, v19.4s, v3.4s\n"
+ "fmax v1.4s, v1.4s, v29.4s\n"
+ "str s2, [x20, x28]\n"
+ "fmla v23.4s, v22.4s, v8.4s\n"
+ "fmax v16.4s, v16.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 8]\n"
+ "fmla v24.4s, v28.4s, v3.4s\n"
+ "fmax v0.4s, v0.4s, v29.4s\n"
+ "str s18, [x20, x28]\n"
+ "fmax v15.4s, v15.4s, v29.4s\n"
+ "str s16, [x21, x28]\n"
+ "fmla v23.4s, v19.4s, v4.4s\n"
+ "fmax v21.4s, v21.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 16]\n"
+ "fmax v13.4s, v13.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 40]\n"
+ "str s17, [x20, x28]\n"
+ "fmax v12.4s, v12.4s, v29.4s\n"
+ "str s0, [x21, x28]\n"
+ "fmla v23.4s, v28.4s, v6.4s\n"
+ "str s13, [x22, x28]\n"
+ "fmax v11.4s, v11.4s, v29.4s\n"
+ "fmax v24.4s, v24.4s, v29.4s\n"
+ "ldr x20, [%[outptrs], 24]\n"
+ "fmax v14.4s, v14.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 48]\n"
+ "str s1, [x20, x28]\n"
+ "fmla v23.4s, v30.4s, v3.4s\n"
+ "str s15, [x21, x28]\n"
+ "fmax v10.4s, v10.4s, v29.4s\n"
+ "str s14, [x23, x28]\n"
+ "fmax v20.4s, v20.4s, v29.4s\n"
+ "ldr x21, [%[outptrs], 56]\n"
+ "ldr x22, [%[outptrs], 72]\n"
+ "ldr x23, [%[outptrs], 104]\n"
+ "fmax v23.4s, v23.4s, v29.4s\n"
+ "str s21, [x21, x28]\n"
+ "str s12, [x22, x28]\n"
+ "str s10, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 80]\n"
+ "ldr x23, [%[outptrs], 112]\n"
+ "str s11, [x22, x28]\n"
+ "str s20, [x23, x28]\n"
+ "ldr x22, [%[outptrs], 88]\n"
+ "ldr x23, [%[outptrs], 120]\n"
+ "str s24, [x22, x28]\n"
+ "str s23, [x23, x28]\n"
+ "add x28, x28, #4\n"
+ "7:\n"
+ : [wbptr] "+r" (weight_bias_ptr)
+ : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
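The specialization closed above fuses a ReLU into a 3x3 depthwise convolution over a 6x6 input tile that produces a 4x4 output tile: the main loop (label `2:`) consumes four channels per iteration with q-register loads and vector `fmla`, the `5:` tail loop handles the remaining `n_channels & 3` channels with s-register loads, and every accumulator is clamped against zero with `fmax` before being stored. As a rough orientation only, a minimal scalar sketch of the per-channel arithmetic, assuming an illustrative {bias, w0..w8} weight layout and the natural row-major kernel ordering (neither is taken from the library's documented API), might look like:

#include <algorithm>

// Hypothetical scalar reference for one channel of the tile computed by the
// specialization above. The function name, the {bias, w0..w8} layout behind
// 'wb', and the row-major kernel ordering are illustrative assumptions; only
// the 6x6 input / 4x4 output tile shape and the fused ReLU are taken from the
// kernel's signature and its fmax-against-zero stores.
static void depthwise_3x3_relu_tile_reference(int c,
                                              const float *wb,          // assumed: bias followed by nine weights
                                              const float *inptrs[6][6],
                                              float *outptrs[4][4])
{
    const float  bias = wb[0];
    const float *w    = wb + 1;
    for (int oy = 0; oy < 4; ++oy)
    {
        for (int ox = 0; ox < 4; ++ox)
        {
            float acc = bias;
            for (int ky = 0; ky < 3; ++ky)
            {
                for (int kx = 0; kx < 3; ++kx)
                {
                    // inptrs[i][j] points at channel-contiguous data for input position (i, j)
                    acc += inptrs[oy + ky][ox + kx][c] * w[ky * 3 + kx];
                }
            }
            outptrs[oy][ox][c] = std::max(acc, 0.0f); // fused ReLU
        }
    }
}

Unlike the nested loops in this sketch, the assembly keeps all sixteen output accumulators (one per element of the 4x4 tile) live at once, so each loaded input value is immediately reused by every kernel tap that touches it; that is why the loads, prefetches and fmla instructions are interleaved rather than nested.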
- "vptr1 .req x5\n"
- "vptr2 .req x6\n"
- "vptr3 .req x7\n"
-
- "wptr1 .req x8\n"
- "wptr2 .req x9\n"
-
- // Prepare pointers and strides
- "add uptr1, %x[uptr0], %x[u_row_stride]\n"
- "add uptr2, uptr1 , %x[u_row_stride]\n"
- "add uptr3, uptr2 , %x[u_row_stride]\n"
- "add uptr4, uptr3 , %x[u_row_stride]\n"
- "add uptr5, uptr4 , %x[u_row_stride]\n"
-
- "add vptr1, %x[vptr0], %x[v_row_stride]\n"
- "add vptr2, vptr1 , %x[v_row_stride]\n"
- "add vptr3, vptr2 , %x[v_row_stride]\n"
-
- "add wptr1, %x[wptr0], %x[w_row_stride]\n"
- "add wptr2, wptr1 , %x[w_row_stride]\n"
-
- // Load initial operands
- "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
- "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
- "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
- "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
- "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
- "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
- "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
- "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
- "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
- "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
- "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "fmul vV14.4s, vU16.4s, vW13.4s\n"
- "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
- "fmul vV13.4s, vU15.4s, vW13.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV14.4s, vU15.4s, vW12.4s\n"
- "ldr qW21, [wptr1], #0x10\n"
- "fmul vV12.4s, vU14.4s, vW13.4s\n"
- "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU14.4s, vW12.4s\n"
- "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
- "fmla vV14.4s, vU14.4s, vW11.4s\n"
- "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
- "fmla vV14.4s, vU26.4s, vW23.4s\n"
- "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
- "fmul vV24.4s, vU26.4s, vW13.4s\n"
- "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU25.4s, vW23.4s\n"
- "beq 2f\n" // Single iteration only
-
- "1:" // Loop body
- "fmla vV14.4s, vU25.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[wptr0], %[prftch]]\n"
- "fmul vV23.4s, vU25.4s, vW13.4s\n"
- "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride1]]\n"
- "fmla vV24.4s, vU25.4s, vW12.4s\n"
- "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
- "fmla vV12.4s, vU24.4s, vW23.4s\n"
- "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV13.4s, vU24.4s, vW22.4s\n"
- "prfm pldl1keep, [ wptr1 , %[prftch]]\n"
- "fmla vV14.4s, vU24.4s, vW21.4s\n"
- "prfm pldl1keep, [ wptr1 , %x[prftch_uvw_col_stride1]]\n"
- "fmul vV22.4s, vU24.4s, vW13.4s\n"
- "prfm pldl1keep, [ wptr1 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV23.4s, vU24.4s, vW12.4s\n"
- "prfm pldl1keep, [ wptr2 , %x[prftch]]\n"
- "fmla vV24.4s, vU24.4s, vW11.4s\n"
- "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
- "fmla vV14.4s, vU36.4s, vW33.4s\n"
- "prfm pldl1keep, [ wptr2 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV24.4s, vU36.4s, vW23.4s\n"
- "prfm pldl1keep, [ wptr2 , %x[prftch_uvw_col_stride2] ]\n"
- "fmul vV34.4s, vU36.4s, vW13.4s\n"
- "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU35.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV14.4s, vU35.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV23.4s, vU35.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV24.4s, vU35.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride4] ]\n"
- "fmul vV33.4s, vU35.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV34.4s, vU35.4s, vW12.4s\n"
- "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
- "fmla vV12.4s, vU34.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr3 , %[prftch]]\n"
- "fmla vV13.4s, vU34.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV14.4s, vU34.4s, vW31.4s\n"
- "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
- "fmla vV22.4s, vU34.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV23.4s, vU34.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV24.4s, vU34.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride4] ]\n"
- "fmul vV32.4s, vU34.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV33.4s, vU34.4s, vW12.4s\n"
- "prfm pldl1keep, [ uptr4 , %[prftch]]\n"
- "fmla vV34.4s, vU34.4s, vW11.4s\n"
- "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
- "fmla vV24.4s, vU46.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV34.4s, vU46.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride2] ]\n"
- "fmul vV44.4s, vU46.4s, vW13.4s\n"
- "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
- "fmla vV23.4s, vU45.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV24.4s, vU45.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride4] ]\n"
- "fmla vV33.4s, vU45.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV34.4s, vU45.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr5 , %[prftch]]\n"
- "fmul vV43.4s, vU45.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV44.4s, vU45.4s, vW12.4s\n"
- "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
- "fmla vV22.4s, vU44.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV23.4s, vU44.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV24.4s, vU44.4s, vW31.4s\n"
- "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
- "fmla vV32.4s, vU44.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride4] ]\n"
- "fmla vV33.4s, vU44.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV34.4s, vU44.4s, vW21.4s\n"
- "prfm pstl1keep, [%x[vptr0], %[prftch]]\n"
- "fmul vV42.4s, vU44.4s, vW13.4s\n"
- "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride1]]\n"
- "fmla vV43.4s, vU44.4s, vW12.4s\n"
- "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV44.4s, vU44.4s, vW11.4s\n"
- "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
- "fmla vV34.4s, vU56.4s, vW33.4s\n"
- "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV44.4s, vU56.4s, vW23.4s\n"
- "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
- "fmla vV33.4s, vU55.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr1 , %[prftch]]\n"
- "fmla vV34.4s, vU55.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV43.4s, vU55.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV44.4s, vU55.4s, vW22.4s\n"
- "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
- "fmla vV32.4s, vU54.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr1 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV33.4s, vU54.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr2 , %[prftch]]\n"
- "fmla vV34.4s, vU54.4s, vW31.4s\n"
- "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
- "fmla vV42.4s, vU54.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV43.4s, vU54.4s, vW22.4s\n"
- "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV44.4s, vU54.4s, vW21.4s\n"
- "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
- "fmla vV44.4s, vU66.4s, vW33.4s\n"
- "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
- "fmla vV43.4s, vU65.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr2 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV44.4s, vU65.4s, vW32.4s\n"
- "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
- "fmla vV42.4s, vU64.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr3 , %[prftch]]\n"
- "fmla vV43.4s, vU64.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV44.4s, vU64.4s, vW31.4s\n"
- "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
- "fmla vV12.4s, vU13.4s, vW12.4s\n"
- "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV13.4s, vU13.4s, vW11.4s\n"
- "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr3 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV12.4s, vU23.4s, vW22.4s\n"
- "fmla vV13.4s, vU23.4s, vW21.4s\n"
- "fmul vV21.4s, vU23.4s, vW13.4s\n"
- "fmla vV22.4s, vU23.4s, vW12.4s\n"
- "fmla vV23.4s, vU23.4s, vW11.4s\n"
- "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "fmla vV12.4s, vU33.4s, vW32.4s\n"
- "fmla vV13.4s, vU33.4s, vW31.4s\n"
- "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
- "fmla vV21.4s, vU33.4s, vW23.4s\n"
- "fmla vV22.4s, vU33.4s, vW22.4s\n"
- "fmla vV23.4s, vU33.4s, vW21.4s\n"
- "fmul vV31.4s, vU33.4s, vW13.4s\n"
- "fmla vV32.4s, vU33.4s, vW12.4s\n"
- "fmla vV33.4s, vU33.4s, vW11.4s\n"
- "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
- "fmla vV21.4s, vU43.4s, vW33.4s\n"
- "fmla vV22.4s, vU43.4s, vW32.4s\n"
- "fmla vV23.4s, vU43.4s, vW31.4s\n"
- "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
- "fmla vV31.4s, vU43.4s, vW23.4s\n"
- "fmla vV32.4s, vU43.4s, vW22.4s\n"
- "fmla vV33.4s, vU43.4s, vW21.4s\n"
- "fmul vV41.4s, vU43.4s, vW13.4s\n"
- "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
- "fmla vV42.4s, vU43.4s, vW12.4s\n"
- "fmla vV43.4s, vU43.4s, vW11.4s\n"
- "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
- "fmla vV31.4s, vU53.4s, vW33.4s\n"
- "fmla vV32.4s, vU53.4s, vW32.4s\n"
- "fmla vV33.4s, vU53.4s, vW31.4s\n"
- "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
- "fmla vV41.4s, vU53.4s, vW23.4s\n"
- "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
- "fmla vV42.4s, vU53.4s, vW22.4s\n"
- "fmla vV43.4s, vU53.4s, vW21.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV41.4s, vU63.4s, vW33.4s\n"
- "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
- "fmla vV42.4s, vU63.4s, vW32.4s\n"
- "prfm pldl1keep, [%x[uptr0], %[prftch]]\n"
- "fmla vV43.4s, vU63.4s, vW31.4s\n"
- "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV12.4s, vU12.4s, vW11.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride1]]\n"
- "fmla vV12.4s, vU22.4s, vW21.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV21.4s, vU22.4s, vW12.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV22.4s, vU22.4s, vW11.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride4] ]\n"
- "fmla vV12.4s, vU32.4s, vW31.4s\n"
- "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
- "fmla vV21.4s, vU32.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV22.4s, vU32.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr1 , %[prftch]]\n"
- "fmla vV31.4s, vU32.4s, vW12.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride1]]\n"
- "fmla vV32.4s, vU32.4s, vW11.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride2] ]\n"
- "fmla vV22.4s, vU42.4s, vW31.4s\n"
- "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
- "fmla vV31.4s, vU42.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride3] ]\n"
- "fmla vV32.4s, vU42.4s, vW21.4s\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "fmla vV41.4s, vU42.4s, vW12.4s\n"
- "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
- "fmla vV42.4s, vU42.4s, vW11.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV31.4s, vU52.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride4] ]\n"
- "fmla vV32.4s, vU52.4s, vW31.4s\n"
- "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
- "fmla vV41.4s, vU52.4s, vW22.4s\n"
- "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
- "fmla vV42.4s, vU52.4s, vW21.4s\n"
- "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
- "fmla vV41.4s, vU62.4s, vW32.4s\n"
- "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
- "fmla vV42.4s, vU62.4s, vW31.4s\n"
- "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
- "fmla vV21.4s, vU21.4s, vW11.4s\n"
- "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU31.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[prftch_uvw_col_stride5] ]\n"
- "fmla vV31.4s, vU31.4s, vW11.4s\n"
- "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
- "fmla vV21.4s, vU41.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU41.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr2 , %[prftch]]\n"
- "fmla vV41.4s, vU41.4s, vW11.4s\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "fmla vV31.4s, vU51.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
- "fmla vV41.4s, vU51.4s, vW21.4s\n"
- "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
- "fmla vV41.4s, vU61.4s, vW31.4s\n"
- "str qV41, [vptr3], #0x10\n"
- "fmul vV14.4s, vU16.4s, vW13.4s\n"
- "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
- "fmul vV13.4s, vU15.4s, vW13.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV14.4s, vU15.4s, vW12.4s\n"
- "ldr qW21, [wptr1], #0x10\n"
- "fmul vV12.4s, vU14.4s, vW13.4s\n"
- "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU14.4s, vW12.4s\n"
- "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
- "fmla vV14.4s, vU14.4s, vW11.4s\n"
- "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
- "fmla vV14.4s, vU26.4s, vW23.4s\n"
- "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
- "fmul vV24.4s, vU26.4s, vW13.4s\n"
- "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU25.4s, vW23.4s\n"
- "bne 1b\n"
-
- "2:" // Final iteration
- "fmla vV14.4s, vU25.4s, vW22.4s\n"
- "fmul vV23.4s, vU25.4s, vW13.4s\n"
- "fmla vV24.4s, vU25.4s, vW12.4s\n"
- "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
- "fmla vV12.4s, vU24.4s, vW23.4s\n"
- "fmla vV13.4s, vU24.4s, vW22.4s\n"
- "fmla vV14.4s, vU24.4s, vW21.4s\n"
- "fmul vV22.4s, vU24.4s, vW13.4s\n"
- "fmla vV23.4s, vU24.4s, vW12.4s\n"
- "fmla vV24.4s, vU24.4s, vW11.4s\n"
- "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
- "fmla vV14.4s, vU36.4s, vW33.4s\n"
- "fmla vV24.4s, vU36.4s, vW23.4s\n"
- "fmul vV34.4s, vU36.4s, vW13.4s\n"
- "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
- "fmla vV13.4s, vU35.4s, vW33.4s\n"
- "fmla vV14.4s, vU35.4s, vW32.4s\n"
- "fmla vV23.4s, vU35.4s, vW23.4s\n"
- "fmla vV24.4s, vU35.4s, vW22.4s\n"
- "fmul vV33.4s, vU35.4s, vW13.4s\n"
- "fmla vV34.4s, vU35.4s, vW12.4s\n"
- "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
- "fmla vV12.4s, vU34.4s, vW33.4s\n"
- "fmla vV13.4s, vU34.4s, vW32.4s\n"
- "fmla vV14.4s, vU34.4s, vW31.4s\n"
- "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
- "fmla vV22.4s, vU34.4s, vW23.4s\n"
- "fmla vV23.4s, vU34.4s, vW22.4s\n"
- "fmla vV24.4s, vU34.4s, vW21.4s\n"
- "fmul vV32.4s, vU34.4s, vW13.4s\n"
- "fmla vV33.4s, vU34.4s, vW12.4s\n"
- "fmla vV34.4s, vU34.4s, vW11.4s\n"
- "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
- "fmla vV24.4s, vU46.4s, vW33.4s\n"
- "fmla vV34.4s, vU46.4s, vW23.4s\n"
- "fmul vV44.4s, vU46.4s, vW13.4s\n"
- "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
- "fmla vV23.4s, vU45.4s, vW33.4s\n"
- "fmla vV24.4s, vU45.4s, vW32.4s\n"
- "fmla vV33.4s, vU45.4s, vW23.4s\n"
- "fmla vV34.4s, vU45.4s, vW22.4s\n"
- "fmul vV43.4s, vU45.4s, vW13.4s\n"
- "fmla vV44.4s, vU45.4s, vW12.4s\n"
- "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
- "fmla vV22.4s, vU44.4s, vW33.4s\n"
- "fmla vV23.4s, vU44.4s, vW32.4s\n"
- "fmla vV24.4s, vU44.4s, vW31.4s\n"
- "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
- "fmla vV32.4s, vU44.4s, vW23.4s\n"
- "fmla vV33.4s, vU44.4s, vW22.4s\n"
- "fmla vV34.4s, vU44.4s, vW21.4s\n"
- "fmul vV42.4s, vU44.4s, vW13.4s\n"
- "fmla vV43.4s, vU44.4s, vW12.4s\n"
- "fmla vV44.4s, vU44.4s, vW11.4s\n"
- "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
- "fmla vV34.4s, vU56.4s, vW33.4s\n"
- "fmla vV44.4s, vU56.4s, vW23.4s\n"
- "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
- "fmla vV33.4s, vU55.4s, vW33.4s\n"
- "fmla vV34.4s, vU55.4s, vW32.4s\n"
- "fmla vV43.4s, vU55.4s, vW23.4s\n"
- "fmla vV44.4s, vU55.4s, vW22.4s\n"
- "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
- "fmla vV32.4s, vU54.4s, vW33.4s\n"
- "fmla vV33.4s, vU54.4s, vW32.4s\n"
- "fmla vV34.4s, vU54.4s, vW31.4s\n"
- "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
- "fmla vV42.4s, vU54.4s, vW23.4s\n"
- "fmla vV43.4s, vU54.4s, vW22.4s\n"
- "fmla vV44.4s, vU54.4s, vW21.4s\n"
- "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
- "fmla vV44.4s, vU66.4s, vW33.4s\n"
- "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
- "fmla vV43.4s, vU65.4s, vW33.4s\n"
- "fmla vV44.4s, vU65.4s, vW32.4s\n"
- "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
- "fmla vV42.4s, vU64.4s, vW33.4s\n"
- "fmla vV43.4s, vU64.4s, vW32.4s\n"
- "fmla vV44.4s, vU64.4s, vW31.4s\n"
- "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
- "fmla vV12.4s, vU13.4s, vW12.4s\n"
- "fmla vV13.4s, vU13.4s, vW11.4s\n"
- "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "fmla vV12.4s, vU23.4s, vW22.4s\n"
- "fmla vV13.4s, vU23.4s, vW21.4s\n"
- "fmul vV21.4s, vU23.4s, vW13.4s\n"
- "fmla vV22.4s, vU23.4s, vW12.4s\n"
- "fmla vV23.4s, vU23.4s, vW11.4s\n"
- "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "fmla vV12.4s, vU33.4s, vW32.4s\n"
- "fmla vV13.4s, vU33.4s, vW31.4s\n"
- "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
- "fmla vV21.4s, vU33.4s, vW23.4s\n"
- "fmla vV22.4s, vU33.4s, vW22.4s\n"
- "fmla vV23.4s, vU33.4s, vW21.4s\n"
- "fmul vV31.4s, vU33.4s, vW13.4s\n"
- "fmla vV32.4s, vU33.4s, vW12.4s\n"
- "fmla vV33.4s, vU33.4s, vW11.4s\n"
- "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
- "fmla vV21.4s, vU43.4s, vW33.4s\n"
- "fmla vV22.4s, vU43.4s, vW32.4s\n"
- "fmla vV23.4s, vU43.4s, vW31.4s\n"
- "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
- "fmla vV31.4s, vU43.4s, vW23.4s\n"
- "fmla vV32.4s, vU43.4s, vW22.4s\n"
- "fmla vV33.4s, vU43.4s, vW21.4s\n"
- "fmul vV41.4s, vU43.4s, vW13.4s\n"
- "fmla vV42.4s, vU43.4s, vW12.4s\n"
- "fmla vV43.4s, vU43.4s, vW11.4s\n"
- "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
- "fmla vV31.4s, vU53.4s, vW33.4s\n"
- "fmla vV32.4s, vU53.4s, vW32.4s\n"
- "fmla vV33.4s, vU53.4s, vW31.4s\n"
- "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
- "fmla vV41.4s, vU53.4s, vW23.4s\n"
- "fmla vV42.4s, vU53.4s, vW22.4s\n"
- "fmla vV43.4s, vU53.4s, vW21.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV41.4s, vU63.4s, vW33.4s\n"
- "fmla vV42.4s, vU63.4s, vW32.4s\n"
- "fmla vV43.4s, vU63.4s, vW31.4s\n"
- "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV12.4s, vU12.4s, vW11.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "fmla vV12.4s, vU22.4s, vW21.4s\n"
- "fmla vV21.4s, vU22.4s, vW12.4s\n"
- "fmla vV22.4s, vU22.4s, vW11.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "fmla vV12.4s, vU32.4s, vW31.4s\n"
- "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
- "fmla vV21.4s, vU32.4s, vW22.4s\n"
- "fmla vV22.4s, vU32.4s, vW21.4s\n"
- "fmla vV31.4s, vU32.4s, vW12.4s\n"
- "fmla vV32.4s, vU32.4s, vW11.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW32.4s\n"
- "fmla vV22.4s, vU42.4s, vW31.4s\n"
- "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
- "fmla vV31.4s, vU42.4s, vW22.4s\n"
- "fmla vV32.4s, vU42.4s, vW21.4s\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "fmla vV41.4s, vU42.4s, vW12.4s\n"
- "fmla vV42.4s, vU42.4s, vW11.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV31.4s, vU52.4s, vW32.4s\n"
- "fmla vV32.4s, vU52.4s, vW31.4s\n"
- "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
- "fmla vV41.4s, vU52.4s, vW22.4s\n"
- "fmla vV42.4s, vU52.4s, vW21.4s\n"
- "fmla vV41.4s, vU62.4s, vW32.4s\n"
- "fmla vV42.4s, vU62.4s, vW31.4s\n"
- "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "fmla vV21.4s, vU21.4s, vW11.4s\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU31.4s, vW21.4s\n"
- "fmla vV31.4s, vU31.4s, vW11.4s\n"
- "fmla vV21.4s, vU41.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU41.4s, vW21.4s\n"
- "fmla vV41.4s, vU41.4s, vW11.4s\n"
- "fmla vV31.4s, vU51.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
- "fmla vV41.4s, vU51.4s, vW21.4s\n"
- "fmla vV41.4s, vU61.4s, vW31.4s\n"
- "str qV41, [vptr3], #0x10\n"
-
- ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
- ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
- ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
- ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
- ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
- ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
- ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
- ".unreq qV22\n" ".unreq qU14\n"
- ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
- ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
- ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
- ".unreq qW33\n"
- ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
- ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
- ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
- ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
- ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
- ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
- ".unreq qU53\n" ".unreq vW22\n"
- ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
- ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
- ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
- ".unreq vV12\n" ".unreq vU61\n"
- ".unreq vU26\n" ".unreq vV32\n"
- ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
- ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
- ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
- ".unreq vV22\n" ".unreq vU14\n"
- ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
- ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
- ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
- ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
- ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
- ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
- ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
- ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
- ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
- : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
- [c4_rem] "+r" (c4_rem)
- : [u_row_stride] "r" (in_row_stride * sizeof(float)),
- [v_row_stride] "r" (out_row_stride * sizeof(float)),
- [w_row_stride] "r" (weight_row_stride * sizeof(float)),
- [uvw_col_stride1] "r" (1 * in_col_stride * sizeof(float)),
- [uvw_col_stride2] "r" (2 * in_col_stride * sizeof(float)),
- [uvw_col_stride3] "r" (3 * in_col_stride * sizeof(float)),
- [uvw_col_stride4] "r" (4 * in_col_stride * sizeof(float)),
- [uvw_col_stride5] "r" (5 * in_col_stride * sizeof(float)),
- [prftch] "i" (prefetch_depth * sizeof(float)),
- [prftch_uvw_col_stride1] "r" ((prefetch_depth + 1 * in_col_stride) * sizeof(float)),
- [prftch_uvw_col_stride2] "r" ((prefetch_depth + 2 * in_col_stride) * sizeof(float)),
- [prftch_uvw_col_stride3] "r" ((prefetch_depth + 3 * in_col_stride) * sizeof(float)),
- [prftch_uvw_col_stride4] "r" ((prefetch_depth + 4 * in_col_stride) * sizeof(float)),
- [prftch_uvw_col_stride5] "r" ((prefetch_depth + 5 * in_col_stride) * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
- "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- }
- else if (channels_remaining >= 4)
- {
- int c4_rem = channels_remaining / 4;
- channels_remaining %= 4;
-
- asm volatile (
- "qW22 .req q0\n" "vW22 .req v0\n"
- "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
- "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
- "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
- "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
- "qW21 .req q3\n" "vW21 .req v3\n"
- "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
- "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
- "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
- "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
- "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
- "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
- "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
- "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
- "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
- "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
- "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
- "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
- "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
- "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
- "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
- "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
- "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
- "qW33 .req q16\n" "vW33 .req v16\n"
- "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
- "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
- "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
- "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
- "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
- "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
- "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
- "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
- "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
- "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
- "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
- "qU23 .req q28\n" "qU52 .req q28\n"
- "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
-
- "uptr1 .req x0\n"
- "uptr2 .req x1\n"
- "uptr3 .req x2\n"
- "uptr4 .req x3\n"
- "uptr5 .req x4\n"
-
- "vptr1 .req x5\n"
- "vptr2 .req x6\n"
- "vptr3 .req x7\n"
-
- "wptr1 .req x8\n"
- "wptr2 .req x9\n"
-
- "u_col_stride2 .req x10\n"
- "u_col_stride3 .req x11\n"
- "u_col_stride4 .req x12\n"
- "u_col_stride5 .req x13\n"
-
- "v_col_stride2 .req x14\n"
- "v_col_stride3 .req x15\n"
-
- "w_col_stride2 .req x16\n"
-
- // Prepare pointers and strides
- "add uptr1, %x[uptr0], %x[u_row_stride]\n"
- "add uptr2, uptr1 , %x[u_row_stride]\n"
- "add uptr3, uptr2 , %x[u_row_stride]\n"
- "add uptr4, uptr3 , %x[u_row_stride]\n"
- "add uptr5, uptr4 , %x[u_row_stride]\n"
-
- "add vptr1, %x[vptr0], %x[v_row_stride]\n"
- "add vptr2, vptr1 , %x[v_row_stride]\n"
- "add vptr3, vptr2 , %x[v_row_stride]\n"
-
- "add wptr1, %x[wptr0], %x[w_row_stride]\n"
- "add wptr2, wptr1 , %x[w_row_stride]\n"
-
- "add u_col_stride2, %x[u_col_stride1], %x[u_col_stride1]\n"
- "add u_col_stride3, u_col_stride2 , %x[u_col_stride1]\n"
- "add u_col_stride4, u_col_stride3 , %x[u_col_stride1]\n"
- "add u_col_stride5, u_col_stride4 , %x[u_col_stride1]\n"
-
- "add v_col_stride2, %x[v_col_stride1], %x[v_col_stride1]\n"
- "add v_col_stride3, v_col_stride2 , %x[v_col_stride1]\n"
-
- "add w_col_stride2, %x[w_col_stride1], %x[w_col_stride1]\n"
-
- // Load initial operands
- "ldr qU16, [%x[uptr0], u_col_stride5]\n"
- "ldr qW13, [%x[wptr0], w_col_stride2]\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "ldr qU15, [%x[uptr0], u_col_stride4]\n"
- "ldr qW23, [wptr1, w_col_stride2]\n"
- "ldr qU14, [%x[uptr0], u_col_stride3]\n"
- "ldr qW33, [wptr2, w_col_stride2]\n"
- "ldr qU26, [uptr1, u_col_stride5]\n"
- "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
- "ldr qU25, [uptr1, u_col_stride4]\n"
- "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
- "ldr qU36, [uptr2, u_col_stride5]\n"
- "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "fmul vV14.4s, vU16.4s, vW13.4s\n"
- "ldr qU24, [uptr1, u_col_stride3]\n"
- "fmul vV13.4s, vU15.4s, vW13.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV14.4s, vU15.4s, vW12.4s\n"
- "ldr qW21, [wptr1], #0x10\n"
- "fmul vV12.4s, vU14.4s, vW13.4s\n"
- "ldr qU34, [uptr2, u_col_stride3]\n"
- "fmla vV13.4s, vU14.4s, vW12.4s\n"
- "ldr qU46, [uptr3, u_col_stride5]\n"
- "fmla vV14.4s, vU14.4s, vW11.4s\n"
- "ldr qU45, [uptr3, u_col_stride4]\n"
- "fmla vV14.4s, vU26.4s, vW23.4s\n"
- "ldr qU35, [uptr2, u_col_stride4]\n"
- "fmul vV24.4s, vU26.4s, vW13.4s\n"
- "ldr qU44, [uptr3, u_col_stride3]\n"
- "fmla vV13.4s, vU25.4s, vW23.4s\n"
- "beq 2f\n" // Single iteration only
-
- "1:" // Loop body
- "fmla vV14.4s, vU25.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[wptr0]]\n"
- "fmul vV23.4s, vU25.4s, vW13.4s\n"
- "prfm pldl1keep, [%x[wptr0], %x[w_col_stride1]]\n"
- "fmla vV24.4s, vU25.4s, vW12.4s\n"
- "ldr qU56, [uptr4, u_col_stride5]\n"
- "fmla vV12.4s, vU24.4s, vW23.4s\n"
- "prfm pldl1keep, [%x[wptr0], w_col_stride2 ]\n"
- "fmla vV13.4s, vU24.4s, vW22.4s\n"
- "prfm pldl1keep, [ wptr1 ]\n"
- "fmla vV14.4s, vU24.4s, vW21.4s\n"
- "prfm pldl1keep, [ wptr1 , %x[w_col_stride1]]\n"
- "fmul vV22.4s, vU24.4s, vW13.4s\n"
- "prfm pldl1keep, [ wptr1 , w_col_stride2 ]\n"
- "fmla vV23.4s, vU24.4s, vW12.4s\n"
- "prfm pldl1keep, [ wptr2 ]\n"
- "fmla vV24.4s, vU24.4s, vW11.4s\n"
- "ldr qU55, [uptr4, u_col_stride4]\n"
- "fmla vV14.4s, vU36.4s, vW33.4s\n"
- "prfm pldl1keep, [ wptr2 , %x[w_col_stride1]]\n"
- "fmla vV24.4s, vU36.4s, vW23.4s\n"
- "prfm pldl1keep, [ wptr2 , w_col_stride2 ]\n"
- "fmul vV34.4s, vU36.4s, vW13.4s\n"
- "ldr qU54, [uptr4, u_col_stride3]\n"
- "fmla vV13.4s, vU35.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr2 , %x[u_col_stride1]]\n"
- "fmla vV14.4s, vU35.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr2 , u_col_stride2 ]\n"
- "fmla vV23.4s, vU35.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr2 , u_col_stride3 ]\n"
- "fmla vV24.4s, vU35.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr2 , u_col_stride4 ]\n"
- "fmul vV33.4s, vU35.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr2 , u_col_stride5 ]\n"
- "fmla vV34.4s, vU35.4s, vW12.4s\n"
- "ldr qU66, [uptr5, u_col_stride5]\n"
- "fmla vV12.4s, vU34.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr3 ]\n"
- "fmla vV13.4s, vU34.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr3 , %x[u_col_stride1]]\n"
- "fmla vV14.4s, vU34.4s, vW31.4s\n"
- "str qV14, [%x[vptr0], v_col_stride3]\n"
- "fmla vV22.4s, vU34.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr3 , u_col_stride2 ]\n"
- "fmla vV23.4s, vU34.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr3 , u_col_stride3 ]\n"
- "fmla vV24.4s, vU34.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr3 , u_col_stride4 ]\n"
- "fmul vV32.4s, vU34.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr3 , u_col_stride5 ]\n"
- "fmla vV33.4s, vU34.4s, vW12.4s\n"
- "prfm pldl1keep, [ uptr4 ]\n"
- "fmla vV34.4s, vU34.4s, vW11.4s\n"
- "ldr qU65, [uptr5, u_col_stride4]\n"
- "fmla vV24.4s, vU46.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr4 , %x[u_col_stride1]]\n"
- "fmla vV34.4s, vU46.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr4 , u_col_stride2 ]\n"
- "fmul vV44.4s, vU46.4s, vW13.4s\n"
- "ldr qU64, [uptr5, u_col_stride3]\n"
- "fmla vV23.4s, vU45.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr4 , u_col_stride3 ]\n"
- "fmla vV24.4s, vU45.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr4 , u_col_stride4 ]\n"
- "fmla vV33.4s, vU45.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr4 , u_col_stride5 ]\n"
- "fmla vV34.4s, vU45.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr5 ]\n"
- "fmul vV43.4s, vU45.4s, vW13.4s\n"
- "prfm pldl1keep, [ uptr5 , %x[u_col_stride1]]\n"
- "fmla vV44.4s, vU45.4s, vW12.4s\n"
- "ldr qU13, [%x[uptr0], u_col_stride2]\n"
- "fmla vV22.4s, vU44.4s, vW33.4s\n"
- "prfm pldl1keep, [ uptr5 , u_col_stride2 ]\n"
- "fmla vV23.4s, vU44.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr5 , u_col_stride3 ]\n"
- "fmla vV24.4s, vU44.4s, vW31.4s\n"
- "str qV24, [vptr1, v_col_stride3]\n"
- "fmla vV32.4s, vU44.4s, vW23.4s\n"
- "prfm pldl1keep, [ uptr5 , u_col_stride4 ]\n"
- "fmla vV33.4s, vU44.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr5 , u_col_stride5 ]\n"
- "fmla vV34.4s, vU44.4s, vW21.4s\n"
- "prfm pstl1keep, [%x[vptr0]]\n"
- "fmul vV42.4s, vU44.4s, vW13.4s\n"
- "prfm pstl1keep, [%x[vptr0], %x[v_col_stride1]]\n"
- "fmla vV43.4s, vU44.4s, vW12.4s\n"
- "prfm pstl1keep, [%x[vptr0], v_col_stride2 ]\n"
- "fmla vV44.4s, vU44.4s, vW11.4s\n"
- "ldr qU23, [uptr1, u_col_stride2]\n"
- "fmla vV34.4s, vU56.4s, vW33.4s\n"
- "prfm pstl1keep, [%x[vptr0], v_col_stride3 ]\n"
- "fmla vV44.4s, vU56.4s, vW23.4s\n"
- "ldr qU33, [uptr2, u_col_stride2]\n"
- "fmla vV33.4s, vU55.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr1 ]\n"
- "fmla vV34.4s, vU55.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr1 , %x[v_col_stride1]]\n"
- "fmla vV43.4s, vU55.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr1 , v_col_stride2 ]\n"
- "fmla vV44.4s, vU55.4s, vW22.4s\n"
- "ldr qU43, [uptr3, u_col_stride2]\n"
- "fmla vV32.4s, vU54.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr1 , v_col_stride3 ]\n"
- "fmla vV33.4s, vU54.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr2 ]\n"
- "fmla vV34.4s, vU54.4s, vW31.4s\n"
- "str qV34, [vptr2, v_col_stride3]\n"
- "fmla vV42.4s, vU54.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr2 , %x[v_col_stride1]]\n"
- "fmla vV43.4s, vU54.4s, vW22.4s\n"
- "prfm pstl1keep, [ vptr2 , v_col_stride2 ]\n"
- "fmla vV44.4s, vU54.4s, vW21.4s\n"
- "ldr qU53, [uptr4, u_col_stride2]\n"
- "fmla vV44.4s, vU66.4s, vW33.4s\n"
- "ldr qU63, [uptr5, u_col_stride2]\n"
- "fmla vV43.4s, vU65.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr2 , v_col_stride3 ]\n"
- "fmla vV44.4s, vU65.4s, vW32.4s\n"
- "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
- "fmla vV42.4s, vU64.4s, vW33.4s\n"
- "prfm pstl1keep, [ vptr3 ]\n"
- "fmla vV43.4s, vU64.4s, vW32.4s\n"
- "prfm pstl1keep, [ vptr3 , %x[v_col_stride1]]\n"
- "fmla vV44.4s, vU64.4s, vW31.4s\n"
- "str qV44, [vptr3, v_col_stride3]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
- "fmla vV12.4s, vU13.4s, vW12.4s\n"
- "prfm pstl1keep, [ vptr3 , v_col_stride2 ]\n"
- "fmla vV13.4s, vU13.4s, vW11.4s\n"
- "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "prfm pstl1keep, [ vptr3 , v_col_stride3 ]\n"
- "fmla vV12.4s, vU23.4s, vW22.4s\n"
- "fmla vV13.4s, vU23.4s, vW21.4s\n"
- "fmul vV21.4s, vU23.4s, vW13.4s\n"
- "fmla vV22.4s, vU23.4s, vW12.4s\n"
- "fmla vV23.4s, vU23.4s, vW11.4s\n"
- "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "fmla vV12.4s, vU33.4s, vW32.4s\n"
- "fmla vV13.4s, vU33.4s, vW31.4s\n"
- "str qV13, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21.4s, vU33.4s, vW23.4s\n"
- "fmla vV22.4s, vU33.4s, vW22.4s\n"
- "fmla vV23.4s, vU33.4s, vW21.4s\n"
- "fmul vV31.4s, vU33.4s, vW13.4s\n"
- "fmla vV32.4s, vU33.4s, vW12.4s\n"
- "fmla vV33.4s, vU33.4s, vW11.4s\n"
- "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
- "fmla vV21.4s, vU43.4s, vW33.4s\n"
- "fmla vV22.4s, vU43.4s, vW32.4s\n"
- "fmla vV23.4s, vU43.4s, vW31.4s\n"
- "str qV23, [vptr1, v_col_stride2]\n"
- "fmla vV31.4s, vU43.4s, vW23.4s\n"
- "fmla vV32.4s, vU43.4s, vW22.4s\n"
- "fmla vV33.4s, vU43.4s, vW21.4s\n"
- "fmul vV41.4s, vU43.4s, vW13.4s\n"
- "ldr qW13, [%x[wptr0], w_col_stride2]\n"
- "fmla vV42.4s, vU43.4s, vW12.4s\n"
- "fmla vV43.4s, vU43.4s, vW11.4s\n"
- "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
- "fmla vV31.4s, vU53.4s, vW33.4s\n"
- "fmla vV32.4s, vU53.4s, vW32.4s\n"
- "fmla vV33.4s, vU53.4s, vW31.4s\n"
- "str qV33, [vptr2, v_col_stride2]\n"
- "fmla vV41.4s, vU53.4s, vW23.4s\n"
- "ldr qW23, [wptr1, w_col_stride2]\n"
- "fmla vV42.4s, vU53.4s, vW22.4s\n"
- "fmla vV43.4s, vU53.4s, vW21.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV41.4s, vU63.4s, vW33.4s\n"
- "ldr qW33, [wptr2, w_col_stride2]\n"
- "fmla vV42.4s, vU63.4s, vW32.4s\n"
- "prfm pldl1keep, [%x[uptr0]]\n"
- "fmla vV43.4s, vU63.4s, vW31.4s\n"
- "str qV43, [vptr3, v_col_stride2]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV12.4s, vU12.4s, vW11.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[uptr0], %x[u_col_stride1]]\n"
- "fmla vV12.4s, vU22.4s, vW21.4s\n"
- "prfm pldl1keep, [%x[uptr0], u_col_stride2 ]\n"
- "fmla vV21.4s, vU22.4s, vW12.4s\n"
- "prfm pldl1keep, [%x[uptr0], u_col_stride3 ]\n"
- "fmla vV22.4s, vU22.4s, vW11.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "prfm pldl1keep, [%x[uptr0], u_col_stride4 ]\n"
- "fmla vV12.4s, vU32.4s, vW31.4s\n"
- "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
- "fmla vV21.4s, vU32.4s, vW22.4s\n"
- "prfm pldl1keep, [%x[uptr0], u_col_stride5 ]\n"
- "fmla vV22.4s, vU32.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr1 ]\n"
- "fmla vV31.4s, vU32.4s, vW12.4s\n"
- "prfm pldl1keep, [ uptr1 , %x[u_col_stride1]]\n"
- "fmla vV32.4s, vU32.4s, vW11.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr1 , u_col_stride2 ]\n"
- "fmla vV22.4s, vU42.4s, vW31.4s\n"
- "str qV22, [vptr1, %x[v_col_stride1]]\n"
- "fmla vV31.4s, vU42.4s, vW22.4s\n"
- "prfm pldl1keep, [ uptr1 , u_col_stride3 ]\n"
- "fmla vV32.4s, vU42.4s, vW21.4s\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "fmla vV41.4s, vU42.4s, vW12.4s\n"
- "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
- "fmla vV42.4s, vU42.4s, vW11.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV31.4s, vU52.4s, vW32.4s\n"
- "prfm pldl1keep, [ uptr1 , u_col_stride4 ]\n"
- "fmla vV32.4s, vU52.4s, vW31.4s\n"
- "str qV32, [vptr2, %x[v_col_stride1]]\n"
- "fmla vV41.4s, vU52.4s, vW22.4s\n"
- "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
- "fmla vV42.4s, vU52.4s, vW21.4s\n"
- "ldr qU16, [%x[uptr0], u_col_stride5]\n"
- "fmla vV41.4s, vU62.4s, vW32.4s\n"
- "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
- "fmla vV42.4s, vU62.4s, vW31.4s\n"
- "str qV42, [vptr3, %x[v_col_stride1]]\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "ldr qU15, [%x[uptr0], u_col_stride4]\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "ldr qU14, [%x[uptr0], u_col_stride3]\n"
- "fmla vV21.4s, vU21.4s, vW11.4s\n"
- "ldr qU26, [uptr1, u_col_stride5]\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU31.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr1 , u_col_stride5 ]\n"
- "fmla vV31.4s, vU31.4s, vW11.4s\n"
- "ldr qU25, [uptr1, u_col_stride4]\n"
- "fmla vV21.4s, vU41.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU41.4s, vW21.4s\n"
- "prfm pldl1keep, [ uptr2 ]\n"
- "fmla vV41.4s, vU41.4s, vW11.4s\n"
- "ldr qW11, [%x[wptr0]], #0x10\n"
- "fmla vV31.4s, vU51.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
- "fmla vV41.4s, vU51.4s, vW21.4s\n"
- "ldr qU36, [uptr2, u_col_stride5]\n"
- "fmla vV41.4s, vU61.4s, vW31.4s\n"
- "str qV41, [vptr3], #0x10\n"
- "fmul vV14.4s, vU16.4s, vW13.4s\n"
- "ldr qU24, [uptr1, u_col_stride3]\n"
- "fmul vV13.4s, vU15.4s, vW13.4s\n"
- "ldr qW31, [wptr2], #0x10\n"
- "fmla vV14.4s, vU15.4s, vW12.4s\n"
- "ldr qW21, [wptr1], #0x10\n"
- "fmul vV12.4s, vU14.4s, vW13.4s\n"
- "ldr qU34, [uptr2, u_col_stride3]\n"
- "fmla vV13.4s, vU14.4s, vW12.4s\n"
- "ldr qU46, [uptr3, u_col_stride5]\n"
- "fmla vV14.4s, vU14.4s, vW11.4s\n"
- "ldr qU45, [uptr3, u_col_stride4]\n"
- "fmla vV14.4s, vU26.4s, vW23.4s\n"
- "ldr qU35, [uptr2, u_col_stride4]\n"
- "fmul vV24.4s, vU26.4s, vW13.4s\n"
- "ldr qU44, [uptr3, u_col_stride3]\n"
- "fmla vV13.4s, vU25.4s, vW23.4s\n"
- "bne 1b\n"
-
- "2:" // Final iteration
- "fmla vV14.4s, vU25.4s, vW22.4s\n"
- "fmul vV23.4s, vU25.4s, vW13.4s\n"
- "fmla vV24.4s, vU25.4s, vW12.4s\n"
- "ldr qU56, [uptr4, u_col_stride5]\n"
- "fmla vV12.4s, vU24.4s, vW23.4s\n"
- "fmla vV13.4s, vU24.4s, vW22.4s\n"
- "fmla vV14.4s, vU24.4s, vW21.4s\n"
- "fmul vV22.4s, vU24.4s, vW13.4s\n"
- "fmla vV23.4s, vU24.4s, vW12.4s\n"
- "fmla vV24.4s, vU24.4s, vW11.4s\n"
- "ldr qU55, [uptr4, u_col_stride4]\n"
- "fmla vV14.4s, vU36.4s, vW33.4s\n"
- "fmla vV24.4s, vU36.4s, vW23.4s\n"
- "fmul vV34.4s, vU36.4s, vW13.4s\n"
- "ldr qU54, [uptr4, u_col_stride3]\n"
- "fmla vV13.4s, vU35.4s, vW33.4s\n"
- "fmla vV14.4s, vU35.4s, vW32.4s\n"
- "fmla vV23.4s, vU35.4s, vW23.4s\n"
- "fmla vV24.4s, vU35.4s, vW22.4s\n"
- "fmul vV33.4s, vU35.4s, vW13.4s\n"
- "fmla vV34.4s, vU35.4s, vW12.4s\n"
- "ldr qU66, [uptr5, u_col_stride5]\n"
- "fmla vV12.4s, vU34.4s, vW33.4s\n"
- "fmla vV13.4s, vU34.4s, vW32.4s\n"
- "fmla vV14.4s, vU34.4s, vW31.4s\n"
- "str qV14, [%x[vptr0], v_col_stride3]\n"
- "fmla vV22.4s, vU34.4s, vW23.4s\n"
- "fmla vV23.4s, vU34.4s, vW22.4s\n"
- "fmla vV24.4s, vU34.4s, vW21.4s\n"
- "fmul vV32.4s, vU34.4s, vW13.4s\n"
- "fmla vV33.4s, vU34.4s, vW12.4s\n"
- "fmla vV34.4s, vU34.4s, vW11.4s\n"
- "ldr qU65, [uptr5, u_col_stride4]\n"
- "fmla vV24.4s, vU46.4s, vW33.4s\n"
- "fmla vV34.4s, vU46.4s, vW23.4s\n"
- "fmul vV44.4s, vU46.4s, vW13.4s\n"
- "ldr qU64, [uptr5, u_col_stride3]\n"
- "fmla vV23.4s, vU45.4s, vW33.4s\n"
- "fmla vV24.4s, vU45.4s, vW32.4s\n"
- "fmla vV33.4s, vU45.4s, vW23.4s\n"
- "fmla vV34.4s, vU45.4s, vW22.4s\n"
- "fmul vV43.4s, vU45.4s, vW13.4s\n"
- "fmla vV44.4s, vU45.4s, vW12.4s\n"
- "ldr qU13, [%x[uptr0], u_col_stride2]\n"
- "fmla vV22.4s, vU44.4s, vW33.4s\n"
- "fmla vV23.4s, vU44.4s, vW32.4s\n"
- "fmla vV24.4s, vU44.4s, vW31.4s\n"
- "str qV24, [vptr1, v_col_stride3]\n"
- "fmla vV32.4s, vU44.4s, vW23.4s\n"
- "fmla vV33.4s, vU44.4s, vW22.4s\n"
- "fmla vV34.4s, vU44.4s, vW21.4s\n"
- "fmul vV42.4s, vU44.4s, vW13.4s\n"
- "fmla vV43.4s, vU44.4s, vW12.4s\n"
- "fmla vV44.4s, vU44.4s, vW11.4s\n"
- "ldr qU23, [uptr1, u_col_stride2]\n"
- "fmla vV34.4s, vU56.4s, vW33.4s\n"
- "fmla vV44.4s, vU56.4s, vW23.4s\n"
- "ldr qU33, [uptr2, u_col_stride2]\n"
- "fmla vV33.4s, vU55.4s, vW33.4s\n"
- "fmla vV34.4s, vU55.4s, vW32.4s\n"
- "fmla vV43.4s, vU55.4s, vW23.4s\n"
- "fmla vV44.4s, vU55.4s, vW22.4s\n"
- "ldr qU43, [uptr3, u_col_stride2]\n"
- "fmla vV32.4s, vU54.4s, vW33.4s\n"
- "fmla vV33.4s, vU54.4s, vW32.4s\n"
- "fmla vV34.4s, vU54.4s, vW31.4s\n"
- "str qV34, [vptr2, v_col_stride3]\n"
- "fmla vV42.4s, vU54.4s, vW23.4s\n"
- "fmla vV43.4s, vU54.4s, vW22.4s\n"
- "fmla vV44.4s, vU54.4s, vW21.4s\n"
- "ldr qU53, [uptr4, u_col_stride2]\n"
- "fmla vV44.4s, vU66.4s, vW33.4s\n"
- "ldr qU63, [uptr5, u_col_stride2]\n"
- "fmla vV43.4s, vU65.4s, vW33.4s\n"
- "fmla vV44.4s, vU65.4s, vW32.4s\n"
- "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
- "fmla vV42.4s, vU64.4s, vW33.4s\n"
- "fmla vV43.4s, vU64.4s, vW32.4s\n"
- "fmla vV44.4s, vU64.4s, vW31.4s\n"
- "str qV44, [vptr3, v_col_stride3]\n"
- "fmul vV11.4s, vU13.4s, vW13.4s\n"
- "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
- "fmla vV12.4s, vU13.4s, vW12.4s\n"
- "fmla vV13.4s, vU13.4s, vW11.4s\n"
- "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
- "fmla vV11.4s, vU23.4s, vW23.4s\n"
- "fmla vV12.4s, vU23.4s, vW22.4s\n"
- "fmla vV13.4s, vU23.4s, vW21.4s\n"
- "fmul vV21.4s, vU23.4s, vW13.4s\n"
- "fmla vV22.4s, vU23.4s, vW12.4s\n"
- "fmla vV23.4s, vU23.4s, vW11.4s\n"
- "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
- "fmla vV11.4s, vU33.4s, vW33.4s\n"
- "fmla vV12.4s, vU33.4s, vW32.4s\n"
- "fmla vV13.4s, vU33.4s, vW31.4s\n"
- "str qV13, [%x[vptr0], v_col_stride2]\n"
- "fmla vV21.4s, vU33.4s, vW23.4s\n"
- "fmla vV22.4s, vU33.4s, vW22.4s\n"
- "fmla vV23.4s, vU33.4s, vW21.4s\n"
- "fmul vV31.4s, vU33.4s, vW13.4s\n"
- "fmla vV32.4s, vU33.4s, vW12.4s\n"
- "fmla vV33.4s, vU33.4s, vW11.4s\n"
- "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
- "fmla vV21.4s, vU43.4s, vW33.4s\n"
- "fmla vV22.4s, vU43.4s, vW32.4s\n"
- "fmla vV23.4s, vU43.4s, vW31.4s\n"
- "str qV23, [vptr1, v_col_stride2]\n"
- "fmla vV31.4s, vU43.4s, vW23.4s\n"
- "fmla vV32.4s, vU43.4s, vW22.4s\n"
- "fmla vV33.4s, vU43.4s, vW21.4s\n"
- "fmul vV41.4s, vU43.4s, vW13.4s\n"
- "fmla vV42.4s, vU43.4s, vW12.4s\n"
- "fmla vV43.4s, vU43.4s, vW11.4s\n"
- "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
- "fmla vV31.4s, vU53.4s, vW33.4s\n"
- "fmla vV32.4s, vU53.4s, vW32.4s\n"
- "fmla vV33.4s, vU53.4s, vW31.4s\n"
- "str qV33, [vptr2, v_col_stride2]\n"
- "fmla vV41.4s, vU53.4s, vW23.4s\n"
- "fmla vV42.4s, vU53.4s, vW22.4s\n"
- "fmla vV43.4s, vU53.4s, vW21.4s\n"
- "ldr qU11, [%x[uptr0]], #0x10\n"
- "fmla vV41.4s, vU63.4s, vW33.4s\n"
- "fmla vV42.4s, vU63.4s, vW32.4s\n"
- "fmla vV43.4s, vU63.4s, vW31.4s\n"
- "str qV43, [vptr3, v_col_stride2]\n"
- "fmla vV11.4s, vU12.4s, vW12.4s\n"
- "ldr qU21, [uptr1], #0x10\n"
- "fmla vV12.4s, vU12.4s, vW11.4s\n"
- "ldr qU31, [uptr2], #0x10\n"
- "fmla vV11.4s, vU22.4s, vW22.4s\n"
- "fmla vV12.4s, vU22.4s, vW21.4s\n"
- "fmla vV21.4s, vU22.4s, vW12.4s\n"
- "fmla vV22.4s, vU22.4s, vW11.4s\n"
- "ldr qU41, [uptr3], #0x10\n"
- "fmla vV11.4s, vU32.4s, vW32.4s\n"
- "fmla vV12.4s, vU32.4s, vW31.4s\n"
- "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
- "fmla vV21.4s, vU32.4s, vW22.4s\n"
- "fmla vV22.4s, vU32.4s, vW21.4s\n"
- "fmla vV31.4s, vU32.4s, vW12.4s\n"
- "fmla vV32.4s, vU32.4s, vW11.4s\n"
- "ldr qU51, [uptr4], #0x10\n"
- "fmla vV21.4s, vU42.4s, vW32.4s\n"
- "fmla vV22.4s, vU42.4s, vW31.4s\n"
- "str qV22, [vptr1, %x[v_col_stride1]]\n"
- "fmla vV31.4s, vU42.4s, vW22.4s\n"
- "fmla vV32.4s, vU42.4s, vW21.4s\n"
- "subs %x[c4_rem], %x[c4_rem], #1\n"
- "fmla vV41.4s, vU42.4s, vW12.4s\n"
- "fmla vV42.4s, vU42.4s, vW11.4s\n"
- "ldr qU61, [uptr5], #0x10\n"
- "fmla vV31.4s, vU52.4s, vW32.4s\n"
- "fmla vV32.4s, vU52.4s, vW31.4s\n"
- "str qV32, [vptr2, %x[v_col_stride1]]\n"
- "fmla vV41.4s, vU52.4s, vW22.4s\n"
- "fmla vV42.4s, vU52.4s, vW21.4s\n"
- "fmla vV41.4s, vU62.4s, vW32.4s\n"
- "fmla vV42.4s, vU62.4s, vW31.4s\n"
- "str qV42, [vptr3, %x[v_col_stride1]]\n"
- "fmla vV11.4s, vU11.4s, vW11.4s\n"
- "fmla vV11.4s, vU21.4s, vW21.4s\n"
- "fmla vV21.4s, vU21.4s, vW11.4s\n"
- "fmla vV11.4s, vU31.4s, vW31.4s\n"
- "str qV11, [%x[vptr0]], #0x10\n"
- "fmla vV21.4s, vU31.4s, vW21.4s\n"
- "fmla vV31.4s, vU31.4s, vW11.4s\n"
- "fmla vV21.4s, vU41.4s, vW31.4s\n"
- "str qV21, [vptr1], #0x10\n"
- "fmla vV31.4s, vU41.4s, vW21.4s\n"
- "fmla vV41.4s, vU41.4s, vW11.4s\n"
- "fmla vV31.4s, vU51.4s, vW31.4s\n"
- "str qV31, [vptr2], #0x10\n"
- "fmla vV41.4s, vU51.4s, vW21.4s\n"
- "fmla vV41.4s, vU61.4s, vW31.4s\n"
- "str qV41, [vptr3], #0x10\n"
-
- ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
- ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
- ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
- ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
- ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
- ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
- ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
- ".unreq qV22\n" ".unreq qU14\n"
- ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
- ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
- ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
- ".unreq qW33\n"
- ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
- ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
- ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
- ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
- ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
- ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
- ".unreq qU53\n" ".unreq vW22\n"
- ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
- ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
- ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
- ".unreq vV12\n" ".unreq vU61\n"
- ".unreq vU26\n" ".unreq vV32\n"
- ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
- ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
- ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
- ".unreq vV22\n" ".unreq vU14\n"
- ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
- ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
- ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
- ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
- ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
- ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
- ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
- ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
- ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
- : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
- [c4_rem] "+r" (c4_rem)
- : [u_row_stride] "r" (in_row_stride * sizeof(float)),
- [u_col_stride1] "r" (in_col_stride * sizeof(float)),
- [v_row_stride] "r" (out_row_stride * sizeof(float)),
- [v_col_stride1] "r" (out_col_stride * sizeof(float)),
- [w_row_stride] "r" (weight_row_stride * sizeof(float)),
- [w_col_stride1] "r" (weight_col_stride * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
- "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
- "x12", "x13", "x14", "x15", "x16", "cc", "memory"
- );
- }
- for (; channels_remaining; channels_remaining--)
- {
- // Load input tile
- float u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- const float* const inptr_row = uptr0 + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
- {
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = static_cast<float>(0);
- }
- else
- {
- u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
- }
- }
- }
- uptr0++;
-
- // Load weights tile
- float w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
- {
- const float* const wptr_row = wptr0 + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
- {
- w[i][j] = *(wptr_row + j*weight_col_stride);
- }
- }
- wptr0++;
-
- // Perform the convolution
- float v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
- {
- for (int out_j = 0; out_j < out_cells_j; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = static_cast<float>(0);
-
- // Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
-
- // Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
- {
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
- }
- }
-
- // Store the output tile
- for (int i = 0; i < out_cells_i; i++)
- {
- float* const outptr_row = vptr0 + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- vptr0++;
- }
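The scalar tail removed above spells out, per remaining channel, the direct convolution that the hand-written NEON kernels compute four channels at a time: load a padded input tile, load the 3x3 weights, accumulate each output cell, then store the output tile. A minimal standalone sketch of that per-channel computation, using assumed names and fixed tile sizes (not the library's own types or API), with unit stride and a 4x4 output tile read from a 6x6 input tile:

// Sketch only: one channel of a direct 3x3 convolution, as performed by the
// scalar fallback above. Names and tile dimensions are illustrative assumptions.
#include <cstddef>

constexpr int kernel_rows = 3, kernel_cols = 3;
constexpr int tile_rows = 4, tile_cols = 4;

void conv3x3_single_channel(const float u[tile_rows + 2][tile_cols + 2],  // padded input tile
                            const float w[kernel_rows][kernel_cols],      // 3x3 weights
                            float v[tile_rows][tile_cols])                // output tile
{
    for (int out_i = 0; out_i < tile_rows; out_i++)
    {
        for (int out_j = 0; out_j < tile_cols; out_j++)
        {
            float acc = 0.0f;  // clear the accumulator for this output cell
            for (int in_i = 0; in_i < kernel_rows; in_i++)
            {
                for (int in_j = 0; in_j < kernel_cols; in_j++)
                {
                    // accumulate weight * input over the 3x3 receptive field
                    acc += w[in_i][in_j] * u[out_i + in_i][out_j + in_j];
                }
            }
            v[out_i][out_j] = acc;
        }
    }
}

The assembly paths interleave the same multiply-accumulates (fmul/fmla) with loads, stores and prefetches so that four output rows and columns are produced per loop iteration; the C++ loop is only reached for the channels left over after the vectorised blocks.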
+template <>
+template <>
+void Conv::execute_tile<ActivationFunction::ReLU6>(
+ int n_channels,
+ const void *weight_bias_ptr,
+ const float *input,
+ const unsigned int input_row_stride,
+ const unsigned int input_col_stride,
+ float *output,
+ const unsigned int output_row_stride,
+ const unsigned int output_col_stride
+)
+{
+ __asm __volatile(
+ "add x24, %[inptr0], %[input_row_stride]\n"
+ "add x13, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x8, %[outptr0], %[output_row_stride]\n"
+ "add x9, x24, %[input_row_stride]\n"
+ "add x10, x13, #64\n"
+ "add x19, x13, %[input_col_stride1]\n"
+ "add x20, x9, %[input_row_stride]\n"
+ "add x21, x19, #64\n"
+ "add x17, x19, %[input_col_stride1]\n"
+ "add x22, x20, %[input_row_stride]\n"
+ "add x18, x17, #64\n"
+ "add x11, x17, %[input_col_stride1]\n"
+ "add x23, x22, %[input_row_stride]\n"
+ "add x12, x11, #64\n"
+ "add x25, x8, %[output_row_stride]\n"
+ "add x26, x25, %[output_row_stride]\n"
+ "add x27, %[output_col_stride1], %[output_col_stride1]\n"
+ "and x14, %[n_channels], #3\n"
+ "add x28, x27, %[output_col_stride1]\n"
+ "lsr x15, %[n_channels], #2\n"
+ "cbz x15, 4f\n"
+ "1:\n"
+ "ldr q23, [%[wbptr]]\n"
+ "subs x15, x15, #1\n"
+ "mov v12.16b, v23.16b\n"
+ "ldr q20, [%[wbptr], #16]\n"
+ "mov v8.16b, v23.16b\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "mov v11.16b, v23.16b\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "mov v16.16b, v23.16b\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "mov v7.16b, v23.16b\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "mov v10.16b, v23.16b\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "mov v14.16b, v23.16b\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "mov v15.16b, v23.16b\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "mov v17.16b, v23.16b\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "mov v9.16b, v23.16b\n"
+ "ldr q28, [%[inptr0]]\n"
+ "fmla v12.4s, v28.4s, v20.4s\n"
+ "ldr q25, [x24]\n"
+ "fmla v8.4s, v25.4s, v20.4s\n"
+ "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v11.4s, v18.4s, v20.4s\n"
+ "ldr q30, [x9]\n"
+ "fmla v12.4s, v25.4s, v19.4s\n"
+ "ldr q29, [x24, %[input_col_stride1]]\n"
+ "fmla v8.4s, v30.4s, v19.4s\n"
+ "ldr q24, [%[inptr0], x13]\n"
+ "fmla v16.4s, v30.4s, v20.4s\n"
+ "ldr q27, [x20]\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "ldr q22, [x9, %[input_col_stride1]]\n"
+ "fmla v8.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v12.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "prfm pldl1keep, [x24, x16]\n"
+ "prfm pldl1keep, [%[inptr0], x10]\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v12.4s, v29.4s, v4.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "mov v13.16b, v23.16b\n"
+ "ldr q21, [x24, x13]\n"
+ "mov v18.16b, v23.16b\n"
+ "prfm pldl1keep, [x24, x10]\n"
+ "fmla v11.4s, v29.4s, v19.4s\n"
+ "prfm pldl1keep, [%[inptr0], x21]\n"
+ "fmla v7.4s, v29.4s, v20.4s\n"
+ "ldr q25, [%[inptr0], x19]\n"
+ "fmla v12.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v11.4s, v24.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v10.4s, v24.4s, v20.4s\n"
+ "ldr q24, [x22]\n"
+ "fmla v8.4s, v27.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x10]\n"
+ "fmla v16.4s, v27.4s, v19.4s\n"
+ "prfm pldl1keep, [x24, x21]\n"
+ "fmla v14.4s, v27.4s, v20.4s\n"
+ "ldr q26, [x20, %[input_col_stride1]]\n"
+ "fmla v12.4s, v22.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v8.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v11.4s, v22.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x10]\n"
+ "fmla v7.4s, v22.4s, v19.4s\n"
+ "prfm pldl1keep, [x9, x21]\n"
+ "fmla v15.4s, v22.4s, v20.4s\n"
+ "ldr q30, [x9, x13]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x24, x18]\n"
+ "fmla v8.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x12]\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v7.4s, v21.4s, v6.4s\n"
+ "prfm pldl1keep, [x22, x10]\n"
+ "fmla v10.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [x20, x21]\n"
+ "fmla v17.4s, v21.4s, v20.4s\n"
+ "ldr q22, [x24, x19]\n"
+ "fmla v11.4s, v25.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v10.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "fmla v9.4s, v25.4s, v20.4s\n"
+ "ldr q21, [%[inptr0], x17]\n"
+ "fmla v16.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x10]\n"
+ "fmla v14.4s, v24.4s, v19.4s\n"
+ "ldr q24, [x23]\n"
+ "fmla v8.4s, v26.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x21]\n"
+ "fmla v16.4s, v26.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v7.4s, v26.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x12]\n"
+ "fmla v14.4s, v26.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x21]\n"
+ "fmla v15.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v13.4s, v26.4s, v20.4s\n"
+ "ldr q26, [x22, %[input_col_stride1]]\n"
+ "fmla v12.4s, v30.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla v8.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v11.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla v16.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmla v7.4s, v30.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v10.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "subs x15, x15, #1\n"
+ "fmla v17.4s, v30.4s, v19.4s\n"
+ "fmla v18.4s, v30.4s, v20.4s\n"
+ "mov v25.16b, v23.16b\n"
+ "fmla v11.4s, v22.4s, v3.4s\n"
+ "fmla v7.4s, v22.4s, v5.4s\n"
+ "fmla v10.4s, v22.4s, v4.4s\n"
+ "fmla v17.4s, v22.4s, v6.4s\n"
+ "fmla v9.4s, v22.4s, v19.4s\n"
+ "fmla v25.4s, v22.4s, v20.4s\n"
+ "ldr q27, [x20, x13]\n"
+ "fmla v10.4s, v21.4s, v5.4s\n"
+ "fmla v14.4s, v24.4s, v2.4s\n"
+ "mov v22.16b, v23.16b\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v21.16b, v23.16b\n"
+ "fmla v16.4s, v26.4s, v1.4s\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v13.4s, v26.4s, v19.4s\n"
+ "fmla v8.4s, v27.4s, v0.4s\n"
+ "ldr q28, [x9, x19]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v7.4s, v27.4s, v1.4s\n"
+ "fmla v14.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v18.4s, v27.4s, v19.4s\n"
+ "fmla v22.4s, v27.4s, v20.4s\n"
+ "fmla v11.4s, v28.4s, v0.4s\n"
+ "ldr q29, [x24, x17]\n"
+ "fmla v7.4s, v28.4s, v3.4s\n"
+ "fmla v10.4s, v28.4s, v1.4s\n"
+ "fmla v15.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v28.4s, v4.4s\n"
+ "fmla v9.4s, v28.4s, v2.4s\n"
+ "fmla v18.4s, v28.4s, v6.4s\n"
+ "fmla v25.4s, v28.4s, v19.4s\n"
+ "fmla v24.4s, v28.4s, v20.4s\n"
+ "fmla v10.4s, v29.4s, v3.4s\n"
+ "ldr q23, [%[inptr0], x11]\n"
+ "fmla v17.4s, v29.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v9.4s, v29.4s, v4.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v25.4s, v29.4s, v6.4s\n"
+ "ldr q30, [x23, %[input_col_stride1]]\n"
+ "fmla v14.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v9.4s, v23.4s, v5.4s\n"
+ "ldr q23, [x22, x13]\n"
+ "fmla v13.4s, v30.4s, v2.4s\n"
+ "ldr q29, [x20, x19]\n"
+ "fmla v16.4s, v23.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x10]\n"
+ "fmla v14.4s, v23.4s, v3.4s\n"
+ "fmla v15.4s, v23.4s, v1.4s\n"
+ "fmla v13.4s, v23.4s, v4.4s\n"
+ "fmla v18.4s, v23.4s, v2.4s\n"
+ "fmla v22.4s, v23.4s, v19.4s\n"
+ "ldr q23, [x9, x17]\n"
+ "fmla v7.4s, v29.4s, v0.4s\n"
+ "fmla v15.4s, v29.4s, v3.4s\n"
+ "fmla v17.4s, v29.4s, v1.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v2.4s\n"
+ "fmla v22.4s, v29.4s, v6.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v21.4s, v29.4s, v20.4s\n"
+ "ldr q26, [x24, x11]\n"
+ "fmla v10.4s, v23.4s, v0.4s\n"
+ "ldr q28, [x23, x13]\n"
+ "fmla v17.4s, v23.4s, v3.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v9.4s, v23.4s, v1.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v18.4s, v23.4s, v5.4s\n"
+ "prfm pldl1keep, [x24, x16]\n"
+ "fmla v25.4s, v23.4s, v4.4s\n"
+ "fmla v24.4s, v23.4s, v6.4s\n"
+ "fmla v9.4s, v26.4s, v3.4s\n"
+ "ldr q20, [x22, x19]\n"
+ "fmla v14.4s, v28.4s, v0.4s\n"
+ "fmla v13.4s, v28.4s, v1.4s\n"
+ "fmla v25.4s, v26.4s, v5.4s\n"
+ "ldr q26, [x20, x17]\n"
+ "fmla v22.4s, v28.4s, v2.4s\n"
+ "ldr q23, [x9, x11]\n"
+ "fmla v15.4s, v20.4s, v0.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v13.4s, v20.4s, v3.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "fmla v18.4s, v20.4s, v1.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v22.4s, v20.4s, v4.4s\n"
+ "fmla v24.4s, v20.4s, v2.4s\n"
+ "fmla v21.4s, v20.4s, v19.4s\n"
+ "ldr q27, [x23, x19]\n"
+ "fmla v17.4s, v26.4s, v0.4s\n"
+ "ldr q20, [x22, x17]\n"
+ "fmla v18.4s, v26.4s, v3.4s\n"
+ "fmla v25.4s, v26.4s, v1.4s\n"
+ "fmla v22.4s, v26.4s, v5.4s\n"
+ "fmla v24.4s, v26.4s, v4.4s\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr q19, [x20, x11]\n"
+ "fmla v9.4s, v23.4s, v0.4s\n"
+ "ldr q28, [x23, x17]\n"
+ "fmla v25.4s, v23.4s, v3.4s\n"
+ "add x20, x20, #16\n"
+ "fmla v24.4s, v23.4s, v5.4s\n"
+ "ldr q29, [x22, x11]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "fmla v22.4s, v27.4s, v1.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v21.4s, v27.4s, v2.4s\n"
+ "ldr q30, [x23, x11]\n"
+ "fmla v18.4s, v20.4s, v0.4s\n"
+ "ldr q23, [%[wbptr]]\n"
+ "fmla v22.4s, v20.4s, v3.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v24.4s, v20.4s, v1.4s\n"
+ "fmla v21.4s, v20.4s, v4.4s\n"
+ "fmla v25.4s, v19.4s, v0.4s\n"
+ "ldr q20, [%[wbptr], #16]\n"
+ "fmla v22.4s, v28.4s, v0.4s\n"
+ "ldr q6, [%[wbptr], #32]\n"
+ "fmla v21.4s, v19.4s, v5.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v24.4s, v19.4s, v3.4s\n"
+ "ldr q19, [%[wbptr], #64]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmla v21.4s, v28.4s, v1.4s\n"
+ "ldr q5, [%[wbptr], #48]\n"
+ "fmla v24.4s, v29.4s, v0.4s\n"
+ "ldr q4, [%[wbptr], #80]\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmla v21.4s, v29.4s, v3.4s\n"
+ "ldr q2, [%[wbptr], #112]\n"
+ "fmov v27.4s, #6.0\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmla v21.4s, v30.4s, v0.4s\n"
+ "ldr q3, [%[wbptr], #96]\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "ldr q1, [%[wbptr], #128]\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "str q12, [%[outptr0]]\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "str q11, [%[outptr0], %[output_col_stride1]]\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "str q10, [%[outptr0], x27]\n"
+ "fmin v7.4s, v7.4s, v27.4s\n"
+ "str q9, [%[outptr0], x28]\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "str q8, [x8]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "str q7, [x8, %[output_col_stride1]]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str q17, [x8, x27]\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "ldr q0, [%[wbptr], #144]\n"
+ "str q25, [x8, x28]\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q16, [x25]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "ldr q28, [%[inptr0]]\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "ldr q25, [x24]\n"
+ "str q15, [x25, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "str q18, [x25, x27]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "ldr q30, [x9]\n"
+ "str q24, [x25, x28]\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q14, [x26]\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "ldr q29, [x24, %[input_col_stride1]]\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "ldr q24, [%[inptr0], x13]\n"
+ "str q13, [x26, %[output_col_stride1]]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "str q22, [x26, x27]\n"
+ "mov v12.16b, v23.16b\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "ldr q27, [x20]\n"
+ "mov v8.16b, v23.16b\n"
+ "ldr q22, [x9, %[input_col_stride1]]\n"
+ "str q21, [x26, x28]\n"
+ "mov v11.16b, v23.16b\n"
+ "mov v16.16b, v23.16b\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "mov v7.16b, v23.16b\n"
+ "add x8, x8, #16\n"
+ "mov v10.16b, v23.16b\n"
+ "add x25, x25, #16\n"
+ "mov v14.16b, v23.16b\n"
+ "add x26, x26, #16\n"
+ "mov v15.16b, v23.16b\n"
+ "mov v17.16b, v23.16b\n"
+ "mov v9.16b, v23.16b\n"
+ "fmla v12.4s, v28.4s, v20.4s\n"
+ "fmla v8.4s, v25.4s, v20.4s\n"
+ "fmla v11.4s, v18.4s, v20.4s\n"
+ "fmla v16.4s, v30.4s, v20.4s\n"
+ "fmla v12.4s, v25.4s, v19.4s\n"
+ "fmla v8.4s, v30.4s, v19.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "fmla v8.4s, v29.4s, v6.4s\n"
+ "fmla v12.4s, v30.4s, v2.4s\n"
+ "fmla v12.4s, v29.4s, v4.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "mov v13.16b, v23.16b\n"
+ "ldr q21, [x24, x13]\n"
+ "mov v18.16b, v23.16b\n"
+ "prfm pldl1keep, [x24, x10]\n"
+ "fmla v11.4s, v29.4s, v19.4s\n"
+ "prfm pldl1keep, [%[inptr0], x21]\n"
+ "fmla v7.4s, v29.4s, v20.4s\n"
+ "ldr q25, [%[inptr0], x19]\n"
+ "fmla v12.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v11.4s, v24.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v10.4s, v24.4s, v20.4s\n"
+ "ldr q24, [x22]\n"
+ "fmla v8.4s, v27.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x10]\n"
+ "fmla v16.4s, v27.4s, v19.4s\n"
+ "prfm pldl1keep, [x24, x21]\n"
+ "fmla v14.4s, v27.4s, v20.4s\n"
+ "ldr q26, [x20, %[input_col_stride1]]\n"
+ "fmla v12.4s, v22.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v8.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v11.4s, v22.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x10]\n"
+ "fmla v7.4s, v22.4s, v19.4s\n"
+ "prfm pldl1keep, [x9, x21]\n"
+ "fmla v15.4s, v22.4s, v20.4s\n"
+ "ldr q30, [x9, x13]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x24, x18]\n"
+ "fmla v8.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x12]\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v7.4s, v21.4s, v6.4s\n"
+ "prfm pldl1keep, [x22, x10]\n"
+ "fmla v10.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [x20, x21]\n"
+ "fmla v17.4s, v21.4s, v20.4s\n"
+ "ldr q22, [x24, x19]\n"
+ "fmla v11.4s, v25.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v10.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "fmla v9.4s, v25.4s, v20.4s\n"
+ "ldr q21, [%[inptr0], x17]\n"
+ "fmla v16.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x10]\n"
+ "fmla v14.4s, v24.4s, v19.4s\n"
+ "ldr q24, [x23]\n"
+ "fmla v8.4s, v26.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x21]\n"
+ "fmla v16.4s, v26.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v7.4s, v26.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x12]\n"
+ "fmla v14.4s, v26.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x21]\n"
+ "fmla v15.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v13.4s, v26.4s, v20.4s\n"
+ "ldr q26, [x22, %[input_col_stride1]]\n"
+ "fmla v12.4s, v30.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla v8.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v11.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla v16.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmla v7.4s, v30.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #160\n"
+ "fmla v10.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v19.4s\n"
+ "fmla v18.4s, v30.4s, v20.4s\n"
+ "ldr q27, [x20, x13]\n"
+ "fmla v11.4s, v22.4s, v3.4s\n"
+ "fmla v7.4s, v22.4s, v5.4s\n"
+ "fmla v10.4s, v22.4s, v4.4s\n"
+ "fmla v17.4s, v22.4s, v6.4s\n"
+ "fmla v9.4s, v22.4s, v19.4s\n"
+ "fmla v14.4s, v24.4s, v2.4s\n"
+ "mov v25.16b, v23.16b\n"
+ "fmla v16.4s, v26.4s, v1.4s\n"
+ "fmla v10.4s, v21.4s, v5.4s\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v25.4s, v22.4s, v20.4s\n"
+ "ldr q28, [x9, x19]\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "ldr q29, [x24, x17]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "fmla v13.4s, v26.4s, v19.4s\n"
+ "mov v22.16b, v23.16b\n"
+ "fmla v8.4s, v27.4s, v0.4s\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v7.4s, v27.4s, v1.4s\n"
+ "fmla v14.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v18.4s, v27.4s, v19.4s\n"
+ "fmla v22.4s, v27.4s, v20.4s\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v21.16b, v23.16b\n"
+ "fmla v11.4s, v28.4s, v0.4s\n"
+ "fmla v7.4s, v28.4s, v3.4s\n"
+ "fmla v10.4s, v28.4s, v1.4s\n"
+ "fmla v15.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v28.4s, v4.4s\n"
+ "fmla v9.4s, v28.4s, v2.4s\n"
+ "fmla v18.4s, v28.4s, v6.4s\n"
+ "fmla v25.4s, v28.4s, v19.4s\n"
+ "fmla v24.4s, v28.4s, v20.4s\n"
+ "ldr q23, [%[inptr0], x11]\n"
+ "fmla v10.4s, v29.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fmla v17.4s, v29.4s, v5.4s\n"
+ "fmla v9.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v6.4s\n"
+ "ldr q30, [x23, %[input_col_stride1]]\n"
+ "fmla v14.4s, v30.4s, v1.4s\n"
+ "fmla v13.4s, v30.4s, v2.4s\n"
+ "fmla v9.4s, v23.4s, v5.4s\n"
+ "ldr q23, [x22, x13]\n"
+ "fmla v16.4s, v23.4s, v0.4s\n"
+ "ldr q29, [x20, x19]\n"
+ "fmla v14.4s, v23.4s, v3.4s\n"
+ "fmla v15.4s, v23.4s, v1.4s\n"
+ "fmla v13.4s, v23.4s, v4.4s\n"
+ "fmla v18.4s, v23.4s, v2.4s\n"
+ "fmla v22.4s, v23.4s, v19.4s\n"
+ "ldr q23, [x9, x17]\n"
+ "fmla v7.4s, v29.4s, v0.4s\n"
+ "fmla v15.4s, v29.4s, v3.4s\n"
+ "fmla v17.4s, v29.4s, v1.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v2.4s\n"
+ "fmla v22.4s, v29.4s, v6.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v21.4s, v29.4s, v20.4s\n"
+ "ldr q26, [x24, x11]\n"
+ "fmla v10.4s, v23.4s, v0.4s\n"
+ "ldr q28, [x23, x13]\n"
+ "fmla v17.4s, v23.4s, v3.4s\n"
+ "add x24, x24, #16\n"
+ "fmla v9.4s, v23.4s, v1.4s\n"
+ "fmla v18.4s, v23.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v4.4s\n"
+ "fmla v24.4s, v23.4s, v6.4s\n"
+ "fmla v14.4s, v28.4s, v0.4s\n"
+ "ldr q20, [x22, x19]\n"
+ "fmla v9.4s, v26.4s, v3.4s\n"
+ "fmla v13.4s, v28.4s, v1.4s\n"
+ "fmla v25.4s, v26.4s, v5.4s\n"
+ "ldr q26, [x20, x17]\n"
+ "fmla v22.4s, v28.4s, v2.4s\n"
+ "ldr q23, [x9, x11]\n"
+ "fmla v15.4s, v20.4s, v0.4s\n"
+ "add x9, x9, #16\n"
+ "fmla v13.4s, v20.4s, v3.4s\n"
+ "fmla v18.4s, v20.4s, v1.4s\n"
+ "fmla v22.4s, v20.4s, v4.4s\n"
+ "fmla v24.4s, v20.4s, v2.4s\n"
+ "fmla v21.4s, v20.4s, v19.4s\n"
+ "ldr q27, [x23, x19]\n"
+ "fmla v17.4s, v26.4s, v0.4s\n"
+ "ldr q20, [x22, x17]\n"
+ "fmla v18.4s, v26.4s, v3.4s\n"
+ "fmla v25.4s, v26.4s, v1.4s\n"
+ "fmla v22.4s, v26.4s, v5.4s\n"
+ "fmla v24.4s, v26.4s, v4.4s\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr q19, [x20, x11]\n"
+ "fmla v9.4s, v23.4s, v0.4s\n"
+ "ldr q28, [x23, x17]\n"
+ "fmla v25.4s, v23.4s, v3.4s\n"
+ "add x20, x20, #16\n"
+ "fmla v24.4s, v23.4s, v5.4s\n"
+ "ldr q29, [x22, x11]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "add x22, x22, #16\n"
+ "fmla v22.4s, v27.4s, v1.4s\n"
+ "fmla v21.4s, v27.4s, v2.4s\n"
+ "fmla v18.4s, v20.4s, v0.4s\n"
+ "ldr q30, [x23, x11]\n"
+ "fmla v24.4s, v20.4s, v1.4s\n"
+ "add x23, x23, #16\n"
+ "fmla v22.4s, v20.4s, v3.4s\n"
+ "fmla v21.4s, v20.4s, v4.4s\n"
+ "fmla v25.4s, v19.4s, v0.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v24.4s, v19.4s, v3.4s\n"
+ "fmov v27.4s, #6.0\n"
+ "fmla v21.4s, v19.4s, v5.4s\n"
+ "fmla v22.4s, v28.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmla v24.4s, v29.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmla v21.4s, v28.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "str q12, [%[outptr0]]\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "str q11, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v21.4s, v29.4s, v3.4s\n"
+ "str q10, [%[outptr0], x27]\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v26.4s\n"
+ "str q9, [%[outptr0], x28]\n"
+ "fmla v21.4s, v30.4s, v0.4s\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fmin v7.4s, v7.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q8, [x8]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "str q7, [x8, %[output_col_stride1]]\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str q17, [x8, x27]\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q25, [x8, x28]\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "add x8, x8, #16\n"
+ "str q16, [x25]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "str q15, [x25, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q18, [x25, x27]\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "str q24, [x25, x28]\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "str q14, [x26]\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "str q13, [x26, %[output_col_stride1]]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "add x25, x25, #16\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q22, [x26, x27]\n"
+ "str q21, [x26, x28]\n"
+ "add x26, x26, #16\n"
+ "4:\n"
+ "cbz x14, 7f\n"
+ "ldr s23, [%[wbptr]]\n"
+ "mov v12.16b, v23.16b\n"
+ "ldr s20, [%[wbptr], #4]\n"
+ "mov v8.16b, v23.16b\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "mov v11.16b, v23.16b\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "mov v16.16b, v23.16b\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "mov v7.16b, v23.16b\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "mov v10.16b, v23.16b\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "mov v14.16b, v23.16b\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "mov v15.16b, v23.16b\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "mov v17.16b, v23.16b\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "mov v9.16b, v23.16b\n"
+ "ldr s28, [%[inptr0]]\n"
+ "fmla v12.4s, v28.4s, v20.4s\n"
+ "ldr s25, [x24]\n"
+ "fmla v8.4s, v25.4s, v20.4s\n"
+ "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
+ "fmla v11.4s, v18.4s, v20.4s\n"
+ "ldr s30, [x9]\n"
+ "fmla v12.4s, v25.4s, v19.4s\n"
+ "ldr s29, [x24, %[input_col_stride1]]\n"
+ "fmla v8.4s, v30.4s, v19.4s\n"
+ "ldr s24, [%[inptr0], x13]\n"
+ "fmla v16.4s, v30.4s, v20.4s\n"
+ "ldr s27, [x20]\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "ldr s22, [x9, %[input_col_stride1]]\n"
+ "fmla v8.4s, v29.4s, v6.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "subs x14, x14, #1\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "fmla v12.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [x24, x16]\n"
+ "prfm pldl1keep, [%[inptr0], x10]\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v12.4s, v29.4s, v4.4s\n"
+ "beq 6f\n"
+ "5:\n"
+ "mov v13.16b, v23.16b\n"
+ "ldr s21, [x24, x13]\n"
+ "mov v18.16b, v23.16b\n"
+ "prfm pldl1keep, [x24, x10]\n"
+ "fmla v11.4s, v29.4s, v19.4s\n"
+ "prfm pldl1keep, [%[inptr0], x21]\n"
+ "fmla v7.4s, v29.4s, v20.4s\n"
+ "ldr s25, [%[inptr0], x19]\n"
+ "fmla v12.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v11.4s, v24.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v10.4s, v24.4s, v20.4s\n"
+ "ldr s24, [x22]\n"
+ "fmla v8.4s, v27.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x10]\n"
+ "fmla v16.4s, v27.4s, v19.4s\n"
+ "prfm pldl1keep, [x24, x21]\n"
+ "fmla v14.4s, v27.4s, v20.4s\n"
+ "ldr s26, [x20, %[input_col_stride1]]\n"
+ "fmla v12.4s, v22.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v8.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v11.4s, v22.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x10]\n"
+ "fmla v7.4s, v22.4s, v19.4s\n"
+ "prfm pldl1keep, [x9, x21]\n"
+ "fmla v15.4s, v22.4s, v20.4s\n"
+ "ldr s30, [x9, x13]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x24, x18]\n"
+ "fmla v8.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x12]\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v7.4s, v21.4s, v6.4s\n"
+ "prfm pldl1keep, [x22, x10]\n"
+ "fmla v10.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [x20, x21]\n"
+ "fmla v17.4s, v21.4s, v20.4s\n"
+ "ldr s22, [x24, x19]\n"
+ "fmla v11.4s, v25.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v10.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "fmla v9.4s, v25.4s, v20.4s\n"
+ "ldr s21, [%[inptr0], x17]\n"
+ "fmla v16.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x10]\n"
+ "fmla v14.4s, v24.4s, v19.4s\n"
+ "ldr s24, [x23]\n"
+ "fmla v8.4s, v26.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x21]\n"
+ "fmla v16.4s, v26.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v7.4s, v26.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x12]\n"
+ "fmla v14.4s, v26.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x21]\n"
+ "fmla v15.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v13.4s, v26.4s, v20.4s\n"
+ "ldr s26, [x22, %[input_col_stride1]]\n"
+ "fmla v12.4s, v30.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla v8.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v11.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla v16.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmla v7.4s, v30.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v10.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "subs x14, x14, #1\n"
+ "fmla v17.4s, v30.4s, v19.4s\n"
+ "fmla v18.4s, v30.4s, v20.4s\n"
+ "mov v25.16b, v23.16b\n"
+ "fmla v11.4s, v22.4s, v3.4s\n"
+ "fmla v7.4s, v22.4s, v5.4s\n"
+ "fmla v10.4s, v22.4s, v4.4s\n"
+ "fmla v17.4s, v22.4s, v6.4s\n"
+ "fmla v9.4s, v22.4s, v19.4s\n"
+ "fmla v25.4s, v22.4s, v20.4s\n"
+ "ldr s27, [x20, x13]\n"
+ "fmla v10.4s, v21.4s, v5.4s\n"
+ "fmla v14.4s, v24.4s, v2.4s\n"
+ "mov v22.16b, v23.16b\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v21.16b, v23.16b\n"
+ "fmla v16.4s, v26.4s, v1.4s\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v13.4s, v26.4s, v19.4s\n"
+ "fmla v8.4s, v27.4s, v0.4s\n"
+ "ldr s28, [x9, x19]\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v7.4s, v27.4s, v1.4s\n"
+ "fmla v14.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v18.4s, v27.4s, v19.4s\n"
+ "fmla v22.4s, v27.4s, v20.4s\n"
+ "fmla v11.4s, v28.4s, v0.4s\n"
+ "ldr s29, [x24, x17]\n"
+ "fmla v7.4s, v28.4s, v3.4s\n"
+ "fmla v10.4s, v28.4s, v1.4s\n"
+ "fmla v15.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v28.4s, v4.4s\n"
+ "fmla v9.4s, v28.4s, v2.4s\n"
+ "fmla v18.4s, v28.4s, v6.4s\n"
+ "fmla v25.4s, v28.4s, v19.4s\n"
+ "fmla v24.4s, v28.4s, v20.4s\n"
+ "fmla v10.4s, v29.4s, v3.4s\n"
+ "ldr s23, [%[inptr0], x11]\n"
+ "fmla v17.4s, v29.4s, v5.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v9.4s, v29.4s, v4.4s\n"
+ "prfm pldl1keep, [%[inptr0], #64]\n"
+ "fmla v25.4s, v29.4s, v6.4s\n"
+ "ldr s30, [x23, %[input_col_stride1]]\n"
+ "fmla v14.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x16]\n"
+ "fmla v9.4s, v23.4s, v5.4s\n"
+ "ldr s23, [x22, x13]\n"
+ "fmla v13.4s, v30.4s, v2.4s\n"
+ "ldr s29, [x20, x19]\n"
+ "fmla v16.4s, v23.4s, v0.4s\n"
+ "prfm pldl1keep, [%[inptr0], x10]\n"
+ "fmla v14.4s, v23.4s, v3.4s\n"
+ "fmla v15.4s, v23.4s, v1.4s\n"
+ "fmla v13.4s, v23.4s, v4.4s\n"
+ "fmla v18.4s, v23.4s, v2.4s\n"
+ "fmla v22.4s, v23.4s, v19.4s\n"
+ "ldr s23, [x9, x17]\n"
+ "fmla v7.4s, v29.4s, v0.4s\n"
+ "fmla v15.4s, v29.4s, v3.4s\n"
+ "fmla v17.4s, v29.4s, v1.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v2.4s\n"
+ "fmla v22.4s, v29.4s, v6.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v21.4s, v29.4s, v20.4s\n"
+ "ldr s26, [x24, x11]\n"
+ "fmla v10.4s, v23.4s, v0.4s\n"
+ "ldr s28, [x23, x13]\n"
+ "fmla v17.4s, v23.4s, v3.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v9.4s, v23.4s, v1.4s\n"
+ "prfm pldl1keep, [x24, #64]\n"
+ "fmla v18.4s, v23.4s, v5.4s\n"
+ "prfm pldl1keep, [x24, x16]\n"
+ "fmla v25.4s, v23.4s, v4.4s\n"
+ "fmla v24.4s, v23.4s, v6.4s\n"
+ "fmla v9.4s, v26.4s, v3.4s\n"
+ "ldr s20, [x22, x19]\n"
+ "fmla v14.4s, v28.4s, v0.4s\n"
+ "fmla v13.4s, v28.4s, v1.4s\n"
+ "fmla v25.4s, v26.4s, v5.4s\n"
+ "ldr s26, [x20, x17]\n"
+ "fmla v22.4s, v28.4s, v2.4s\n"
+ "ldr s23, [x9, x11]\n"
+ "fmla v15.4s, v20.4s, v0.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v13.4s, v20.4s, v3.4s\n"
+ "prfm pldl1keep, [x9, #64]\n"
+ "fmla v18.4s, v20.4s, v1.4s\n"
+ "prfm pldl1keep, [x9, x16]\n"
+ "fmla v22.4s, v20.4s, v4.4s\n"
+ "fmla v24.4s, v20.4s, v2.4s\n"
+ "fmla v21.4s, v20.4s, v19.4s\n"
+ "ldr s27, [x23, x19]\n"
+ "fmla v17.4s, v26.4s, v0.4s\n"
+ "ldr s20, [x22, x17]\n"
+ "fmla v18.4s, v26.4s, v3.4s\n"
+ "fmla v25.4s, v26.4s, v1.4s\n"
+ "fmla v22.4s, v26.4s, v5.4s\n"
+ "fmla v24.4s, v26.4s, v4.4s\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr s19, [x20, x11]\n"
+ "fmla v9.4s, v23.4s, v0.4s\n"
+ "ldr s28, [x23, x17]\n"
+ "fmla v25.4s, v23.4s, v3.4s\n"
+ "add x20, x20, #4\n"
+ "fmla v24.4s, v23.4s, v5.4s\n"
+ "ldr s29, [x22, x11]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, #64]\n"
+ "fmla v22.4s, v27.4s, v1.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v21.4s, v27.4s, v2.4s\n"
+ "ldr s30, [x23, x11]\n"
+ "fmla v18.4s, v20.4s, v0.4s\n"
+ "ldr s23, [%[wbptr]]\n"
+ "fmla v22.4s, v20.4s, v3.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v24.4s, v20.4s, v1.4s\n"
+ "fmla v21.4s, v20.4s, v4.4s\n"
+ "fmla v25.4s, v19.4s, v0.4s\n"
+ "ldr s20, [%[wbptr], #4]\n"
+ "fmla v22.4s, v28.4s, v0.4s\n"
+ "ldr s6, [%[wbptr], #8]\n"
+ "fmla v21.4s, v19.4s, v5.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v24.4s, v19.4s, v3.4s\n"
+ "ldr s19, [%[wbptr], #16]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmla v21.4s, v28.4s, v1.4s\n"
+ "ldr s5, [%[wbptr], #12]\n"
+ "fmla v24.4s, v29.4s, v0.4s\n"
+ "ldr s4, [%[wbptr], #20]\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "fmla v21.4s, v29.4s, v3.4s\n"
+ "ldr s2, [%[wbptr], #28]\n"
+ "fmov v27.4s, #6.0\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmla v21.4s, v30.4s, v0.4s\n"
+ "ldr s3, [%[wbptr], #24]\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "ldr s1, [%[wbptr], #32]\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "str s12, [%[outptr0]]\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "str s11, [%[outptr0], %[output_col_stride1]]\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "str s10, [%[outptr0], x27]\n"
+ "fmin v7.4s, v7.4s, v27.4s\n"
+ "str s9, [%[outptr0], x28]\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "str s8, [x8]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "str s7, [x8, %[output_col_stride1]]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str s17, [x8, x27]\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "ldr s0, [%[wbptr], #36]\n"
+ "str s25, [x8, x28]\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str s16, [x25]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "ldr s28, [%[inptr0]]\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "ldr s25, [x24]\n"
+ "str s15, [x25, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "str s18, [x25, x27]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "ldr s30, [x9]\n"
+ "str s24, [x25, x28]\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str s14, [x26]\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "ldr s29, [x24, %[input_col_stride1]]\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "ldr s24, [%[inptr0], x13]\n"
+ "str s13, [x26, %[output_col_stride1]]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "str s22, [x26, x27]\n"
+ "mov v12.16b, v23.16b\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "ldr s27, [x20]\n"
+ "mov v8.16b, v23.16b\n"
+ "ldr s22, [x9, %[input_col_stride1]]\n"
+ "str s21, [x26, x28]\n"
+ "mov v11.16b, v23.16b\n"
+ "mov v16.16b, v23.16b\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "mov v7.16b, v23.16b\n"
+ "add x8, x8, #4\n"
+ "mov v10.16b, v23.16b\n"
+ "add x25, x25, #4\n"
+ "mov v14.16b, v23.16b\n"
+ "add x26, x26, #4\n"
+ "mov v15.16b, v23.16b\n"
+ "mov v17.16b, v23.16b\n"
+ "mov v9.16b, v23.16b\n"
+ "fmla v12.4s, v28.4s, v20.4s\n"
+ "fmla v8.4s, v25.4s, v20.4s\n"
+ "fmla v11.4s, v18.4s, v20.4s\n"
+ "fmla v16.4s, v30.4s, v20.4s\n"
+ "fmla v12.4s, v25.4s, v19.4s\n"
+ "fmla v8.4s, v30.4s, v19.4s\n"
+ "fmla v12.4s, v18.4s, v6.4s\n"
+ "fmla v8.4s, v29.4s, v6.4s\n"
+ "fmla v12.4s, v30.4s, v2.4s\n"
+ "fmla v12.4s, v29.4s, v4.4s\n"
+ "bne 5b\n"
+ "6:\n"
+ "mov v13.16b, v23.16b\n"
+ "ldr s21, [x24, x13]\n"
+ "mov v18.16b, v23.16b\n"
+ "prfm pldl1keep, [x24, x10]\n"
+ "fmla v11.4s, v29.4s, v19.4s\n"
+ "prfm pldl1keep, [%[inptr0], x21]\n"
+ "fmla v7.4s, v29.4s, v20.4s\n"
+ "ldr s25, [%[inptr0], x19]\n"
+ "fmla v12.4s, v24.4s, v5.4s\n"
+ "prfm pldl1keep, [x22, #64]\n"
+ "fmla v11.4s, v24.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x16]\n"
+ "fmla v10.4s, v24.4s, v20.4s\n"
+ "ldr s24, [x22]\n"
+ "fmla v8.4s, v27.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x10]\n"
+ "fmla v16.4s, v27.4s, v19.4s\n"
+ "prfm pldl1keep, [x24, x21]\n"
+ "fmla v14.4s, v27.4s, v20.4s\n"
+ "ldr s26, [x20, %[input_col_stride1]]\n"
+ "fmla v12.4s, v22.4s, v1.4s\n"
+ "prfm pldl1keep, [%[inptr0], x18]\n"
+ "fmla v8.4s, v22.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, #64]\n"
+ "fmla v11.4s, v22.4s, v2.4s\n"
+ "prfm pldl1keep, [x22, x16]\n"
+ "fmla v16.4s, v22.4s, v6.4s\n"
+ "prfm pldl1keep, [x20, x10]\n"
+ "fmla v7.4s, v22.4s, v19.4s\n"
+ "prfm pldl1keep, [x9, x21]\n"
+ "fmla v15.4s, v22.4s, v20.4s\n"
+ "ldr s30, [x9, x13]\n"
+ "fmla v12.4s, v21.4s, v3.4s\n"
+ "prfm pldl1keep, [x24, x18]\n"
+ "fmla v8.4s, v21.4s, v5.4s\n"
+ "prfm pldl1keep, [%[inptr0], x12]\n"
+ "fmla v11.4s, v21.4s, v4.4s\n"
+ "prfm pldl1keep, [x23, x16]\n"
+ "fmla v7.4s, v21.4s, v6.4s\n"
+ "prfm pldl1keep, [x22, x10]\n"
+ "fmla v10.4s, v21.4s, v19.4s\n"
+ "prfm pldl1keep, [x20, x21]\n"
+ "fmla v17.4s, v21.4s, v20.4s\n"
+ "ldr s22, [x24, x19]\n"
+ "fmla v11.4s, v25.4s, v5.4s\n"
+ "prfm pldl1keep, [x9, x18]\n"
+ "fmla v10.4s, v25.4s, v6.4s\n"
+ "prfm pldl1keep, [x24, x12]\n"
+ "fmla v9.4s, v25.4s, v20.4s\n"
+ "ldr s21, [%[inptr0], x17]\n"
+ "fmla v16.4s, v24.4s, v2.4s\n"
+ "prfm pldl1keep, [x23, x10]\n"
+ "fmla v14.4s, v24.4s, v19.4s\n"
+ "ldr s24, [x23]\n"
+ "fmla v8.4s, v26.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x21]\n"
+ "fmla v16.4s, v26.4s, v4.4s\n"
+ "prfm pldl1keep, [x20, x18]\n"
+ "fmla v7.4s, v26.4s, v2.4s\n"
+ "prfm pldl1keep, [x9, x12]\n"
+ "fmla v14.4s, v26.4s, v6.4s\n"
+ "prfm pldl1keep, [x23, x21]\n"
+ "fmla v15.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x22, x18]\n"
+ "fmla v13.4s, v26.4s, v20.4s\n"
+ "ldr s26, [x22, %[input_col_stride1]]\n"
+ "fmla v12.4s, v30.4s, v0.4s\n"
+ "prfm pldl1keep, [x20, x12]\n"
+ "fmla v8.4s, v30.4s, v3.4s\n"
+ "prfm pldl1keep, [x23, x18]\n"
+ "fmla v11.4s, v30.4s, v1.4s\n"
+ "prfm pldl1keep, [x22, x12]\n"
+ "fmla v16.4s, v30.4s, v5.4s\n"
+ "prfm pldl1keep, [x23, x12]\n"
+ "fmla v7.4s, v30.4s, v4.4s\n"
+ "add %[wbptr], %[wbptr], #40\n"
+ "fmla v10.4s, v30.4s, v2.4s\n"
+ "prfm pldl1keep, [%[wbptr], #64]\n"
+ "fmla v15.4s, v30.4s, v6.4s\n"
+ "fmla v17.4s, v30.4s, v19.4s\n"
+ "fmla v18.4s, v30.4s, v20.4s\n"
+ "ldr s27, [x20, x13]\n"
+ "fmla v11.4s, v22.4s, v3.4s\n"
+ "fmla v7.4s, v22.4s, v5.4s\n"
+ "fmla v10.4s, v22.4s, v4.4s\n"
+ "fmla v17.4s, v22.4s, v6.4s\n"
+ "fmla v9.4s, v22.4s, v19.4s\n"
+ "fmla v14.4s, v24.4s, v2.4s\n"
+ "mov v25.16b, v23.16b\n"
+ "fmla v16.4s, v26.4s, v1.4s\n"
+ "fmla v10.4s, v21.4s, v5.4s\n"
+ "fmla v15.4s, v26.4s, v2.4s\n"
+ "fmla v25.4s, v22.4s, v20.4s\n"
+ "ldr s28, [x9, x19]\n"
+ "fmla v9.4s, v21.4s, v6.4s\n"
+ "ldr s29, [x24, x17]\n"
+ "fmla v14.4s, v26.4s, v4.4s\n"
+ "fmla v13.4s, v26.4s, v19.4s\n"
+ "mov v22.16b, v23.16b\n"
+ "fmla v8.4s, v27.4s, v0.4s\n"
+ "fmla v16.4s, v27.4s, v3.4s\n"
+ "fmla v7.4s, v27.4s, v1.4s\n"
+ "fmla v14.4s, v27.4s, v5.4s\n"
+ "fmla v15.4s, v27.4s, v4.4s\n"
+ "fmla v17.4s, v27.4s, v2.4s\n"
+ "fmla v13.4s, v27.4s, v6.4s\n"
+ "fmla v18.4s, v27.4s, v19.4s\n"
+ "fmla v22.4s, v27.4s, v20.4s\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v21.16b, v23.16b\n"
+ "fmla v11.4s, v28.4s, v0.4s\n"
+ "fmla v7.4s, v28.4s, v3.4s\n"
+ "fmla v10.4s, v28.4s, v1.4s\n"
+ "fmla v15.4s, v28.4s, v5.4s\n"
+ "fmla v17.4s, v28.4s, v4.4s\n"
+ "fmla v9.4s, v28.4s, v2.4s\n"
+ "fmla v18.4s, v28.4s, v6.4s\n"
+ "fmla v25.4s, v28.4s, v19.4s\n"
+ "fmla v24.4s, v28.4s, v20.4s\n"
+ "ldr s23, [%[inptr0], x11]\n"
+ "fmla v10.4s, v29.4s, v3.4s\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v17.4s, v29.4s, v5.4s\n"
+ "fmla v9.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v6.4s\n"
+ "ldr s30, [x23, %[input_col_stride1]]\n"
+ "fmla v14.4s, v30.4s, v1.4s\n"
+ "fmla v13.4s, v30.4s, v2.4s\n"
+ "fmla v9.4s, v23.4s, v5.4s\n"
+ "ldr s23, [x22, x13]\n"
+ "fmla v16.4s, v23.4s, v0.4s\n"
+ "ldr s29, [x20, x19]\n"
+ "fmla v14.4s, v23.4s, v3.4s\n"
+ "fmla v15.4s, v23.4s, v1.4s\n"
+ "fmla v13.4s, v23.4s, v4.4s\n"
+ "fmla v18.4s, v23.4s, v2.4s\n"
+ "fmla v22.4s, v23.4s, v19.4s\n"
+ "ldr s23, [x9, x17]\n"
+ "fmla v7.4s, v29.4s, v0.4s\n"
+ "fmla v15.4s, v29.4s, v3.4s\n"
+ "fmla v17.4s, v29.4s, v1.4s\n"
+ "fmla v13.4s, v29.4s, v5.4s\n"
+ "fmla v18.4s, v29.4s, v4.4s\n"
+ "fmla v25.4s, v29.4s, v2.4s\n"
+ "fmla v22.4s, v29.4s, v6.4s\n"
+ "fmla v24.4s, v29.4s, v19.4s\n"
+ "fmla v21.4s, v29.4s, v20.4s\n"
+ "ldr s26, [x24, x11]\n"
+ "fmla v10.4s, v23.4s, v0.4s\n"
+ "ldr s28, [x23, x13]\n"
+ "fmla v17.4s, v23.4s, v3.4s\n"
+ "add x24, x24, #4\n"
+ "fmla v9.4s, v23.4s, v1.4s\n"
+ "fmla v18.4s, v23.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v4.4s\n"
+ "fmla v24.4s, v23.4s, v6.4s\n"
+ "fmla v14.4s, v28.4s, v0.4s\n"
+ "ldr s20, [x22, x19]\n"
+ "fmla v9.4s, v26.4s, v3.4s\n"
+ "fmla v13.4s, v28.4s, v1.4s\n"
+ "fmla v25.4s, v26.4s, v5.4s\n"
+ "ldr s26, [x20, x17]\n"
+ "fmla v22.4s, v28.4s, v2.4s\n"
+ "ldr s23, [x9, x11]\n"
+ "fmla v15.4s, v20.4s, v0.4s\n"
+ "add x9, x9, #4\n"
+ "fmla v13.4s, v20.4s, v3.4s\n"
+ "fmla v18.4s, v20.4s, v1.4s\n"
+ "fmla v22.4s, v20.4s, v4.4s\n"
+ "fmla v24.4s, v20.4s, v2.4s\n"
+ "fmla v21.4s, v20.4s, v19.4s\n"
+ "ldr s27, [x23, x19]\n"
+ "fmla v17.4s, v26.4s, v0.4s\n"
+ "ldr s20, [x22, x17]\n"
+ "fmla v18.4s, v26.4s, v3.4s\n"
+ "fmla v25.4s, v26.4s, v1.4s\n"
+ "fmla v22.4s, v26.4s, v5.4s\n"
+ "fmla v24.4s, v26.4s, v4.4s\n"
+ "fmla v21.4s, v26.4s, v6.4s\n"
+ "ldr s19, [x20, x11]\n"
+ "fmla v9.4s, v23.4s, v0.4s\n"
+ "ldr s28, [x23, x17]\n"
+ "fmla v25.4s, v23.4s, v3.4s\n"
+ "add x20, x20, #4\n"
+ "fmla v24.4s, v23.4s, v5.4s\n"
+ "ldr s29, [x22, x11]\n"
+ "fmla v13.4s, v27.4s, v0.4s\n"
+ "add x22, x22, #4\n"
+ "fmla v22.4s, v27.4s, v1.4s\n"
+ "fmla v21.4s, v27.4s, v2.4s\n"
+ "fmla v18.4s, v20.4s, v0.4s\n"
+ "ldr s30, [x23, x11]\n"
+ "fmla v24.4s, v20.4s, v1.4s\n"
+ "add x23, x23, #4\n"
+ "fmla v22.4s, v20.4s, v3.4s\n"
+ "fmla v21.4s, v20.4s, v4.4s\n"
+ "fmla v25.4s, v19.4s, v0.4s\n"
+ "movi v26.16b, #0\n"
+ "fmla v24.4s, v19.4s, v3.4s\n"
+ "fmov v27.4s, #6.0\n"
+ "fmla v21.4s, v19.4s, v5.4s\n"
+ "fmla v22.4s, v28.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v11.4s, v11.4s, v26.4s\n"
+ "fmla v24.4s, v29.4s, v0.4s\n"
+ "fmax v10.4s, v10.4s, v26.4s\n"
+ "fmla v21.4s, v28.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v27.4s\n"
+ "fmin v11.4s, v11.4s, v27.4s\n"
+ "fmin v10.4s, v10.4s, v27.4s\n"
+ "str s12, [%[outptr0]]\n"
+ "fmax v9.4s, v9.4s, v26.4s\n"
+ "str s11, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v21.4s, v29.4s, v3.4s\n"
+ "str s10, [%[outptr0], x27]\n"
+ "fmin v9.4s, v9.4s, v27.4s\n"
+ "fmax v8.4s, v8.4s, v26.4s\n"
+ "fmax v7.4s, v7.4s, v26.4s\n"
+ "str s9, [%[outptr0], x28]\n"
+ "fmla v21.4s, v30.4s, v0.4s\n"
+ "fmin v8.4s, v8.4s, v27.4s\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "fmin v7.4s, v7.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str s8, [x8]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "str s7, [x8, %[output_col_stride1]]\n"
+ "fmin v17.4s, v17.4s, v27.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "str s17, [x8, x27]\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str s25, [x8, x28]\n"
+ "fmin v16.4s, v16.4s, v27.4s\n"
+ "fmin v15.4s, v15.4s, v27.4s\n"
+ "add x8, x8, #4\n"
+ "str s16, [x25]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "str s15, [x25, %[output_col_stride1]]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmin v18.4s, v18.4s, v27.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str s18, [x25, x27]\n"
+ "fmin v14.4s, v14.4s, v27.4s\n"
+ "str s24, [x25, x28]\n"
+ "fmin v13.4s, v13.4s, v27.4s\n"
+ "str s14, [x26]\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "str s13, [x26, %[output_col_stride1]]\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "add x25, x25, #4\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str s22, [x26, x27]\n"
+ "str s21, [x26, x28]\n"
+ "add x26, x26, #4\n"
+ "7:\n"
+ : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
+ );
}
#endif // __aarch64__
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
} // namespace depthwise
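
The assembly kernel added above fuses a bounded activation into the store path: each accumulator is clamped with fmax against a zero vector (movi v26.16b, #0) and fmin against a vector of 6.0 (fmov v27.4s, #6.0) before being written out. A minimal scalar sketch of that clamp, for illustration only (hypothetical helper, not library code):

    // Scalar equivalent of the fmax/fmin pair applied to every accumulator
    // before the str instructions in the assembly above (a ReLU6-style clamp).
    static inline float relu6(float x)
    {
        return (x < 0.0f) ? 0.0f : (x > 6.0f) ? 6.0f : x;
    }
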
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
deleted file mode 100644
index 8f22a64..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_u8_s32.hpp"
-
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
-} // namespace depthwise
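
The file deleted above consisted almost entirely of the padding-specialised dispatch tables (tilefn_unpadded, tilefn_top, tilefn_left, tilefn_bottom, tilefn_right, tilefn_generic): fixed grids of function pointers, each bound to a process_tile specialisation whose non-type template arguments encode how much input and output padding that tile handles, with the <false> instantiation as a run-time fallback. The sketch below reconstructs that pattern with hypothetical names purely for illustration; it is not code from the library, and the 19.05 kernels replace it with plain template class instantiations as seen in the surviving files.

    // Illustrative reconstruction of the removed dispatch-table pattern
    // (hypothetical names; compile-time padding amounts select a specialisation).
    namespace example
    {
    template <bool Specialised, int InPadBottom = 0, int OutPadBottom = 0>
    void process_tile(const float *input, float *output)
    {
        // Specialised inner loops would live here; the template arguments let the
        // compiler drop padding checks for the fully-specified cases.
        (void)input;
        (void)output;
    }

    using TileFn = void (*)(const float *, float *);

    // One entry per (input padding, output padding) pair, mirroring
    // tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] above.
    const TileFn tilefn_bottom[2][2] = {
        { process_tile<true, 0, 0>, process_tile<true, 0, 1> },
        { process_tile<true, 1, 0>, process_tile<true, 1, 1> },
    };

    // Fallback that resolves the padding amounts at run time.
    const TileFn tilefn_generic = process_tile<false>;
    } // namespace example
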
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
deleted file mode 100644
index 09722d0..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp16_fp16.hpp"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
-} // namespace depthwise
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 05315ee..a04609d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,142 +25,5 @@
namespace depthwise
{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
deleted file mode 100644
index cf51550..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_u8_s32.hpp"
-
-namespace depthwise
-{
-using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-
-template <>
-const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
-
-template <>
-const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
- ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
- },
- {
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
- ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
- },
-};
-
-template <>
-const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
-
-template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
new file mode 100644
index 0000000..692086c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+// TODO Move to common utilities somewhere
+template <size_t Size> struct DType { };
+template <> struct DType<1> { using scalar_type = uint8_t; };
+template <> struct DType<2> { using scalar_type = uint16_t; };
+template <> struct DType<4> { using scalar_type = uint32_t; };
+
+namespace depthwise
+{
+
+template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
+void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
+ unsigned int n_channels,
+ void *buffer,
+ const void *weights,
+ const unsigned int weight_row_stride,
+ const unsigned int weight_col_stride,
+ const void *biases
+)
+{
+ using TWeight = typename DType<WeightSize>::scalar_type;
+ using TBias = typename DType<BiasSize>::scalar_type;
+
+ auto buffer_ptr = static_cast<uint8_t *>(buffer);
+ auto weights_ptr = static_cast<const TWeight *>(weights);
+ auto biases_ptr = static_cast<const TBias *>(biases);
+
+ const unsigned int veclen = 16 / WeightSize;
+ for (; n_channels >= veclen; n_channels -= veclen)
+ {
+ // Copy biases
+ for (unsigned int i = 0; i < veclen; i++)
+ {
+ auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
+ *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
+ buffer_ptr += BiasSize;
+ }
+
+ // Copy weights
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelColumns; j++)
+ {
+ for (unsigned int c = 0; c < veclen; c++)
+ {
+ *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c];
+ buffer_ptr += WeightSize;
+ }
+ }
+ }
+ weights_ptr += veclen;
+ }
+ for (; n_channels; n_channels--)
+ {
+ // Copy bias
+ auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
+ *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
+ buffer_ptr += BiasSize;
+
+ // Copy weights
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelColumns; j++)
+ {
+ *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride];
+ buffer_ptr += WeightSize;
+ }
+ }
+ weights_ptr++;
+ }
+}
+
+template struct PackParameters<3, 3, 2ul, 2ul>;
+template struct PackParameters<3, 3, 4ul, 4ul>;
+} // namespace depthwise
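For reference, the packing loop above lays parameters out per channel group: for each block of `veclen` channels it stores `veclen` biases followed by the kernel weights in row-major tap order, one block of channels per tap, and the scalar tail uses the same bias-then-weights order per channel. Below is a minimal sketch of the buffer size this layout implies; the helper name `packed_params_size` is illustrative and not part of the library.

#include <cstddef>

// Each channel contributes one bias plus KernelRows * KernelCols weights,
// regardless of how the channels are grouped into vector blocks.
constexpr std::size_t packed_params_size(std::size_t n_channels,
                                         std::size_t kernel_rows,
                                         std::size_t kernel_cols,
                                         std::size_t weight_size,
                                         std::size_t bias_size)
{
  return n_channels * (bias_size + kernel_rows * kernel_cols * weight_size);
}

// Example: 64 channels of a 3x3 fp32 kernel ->
//   packed_params_size(64, 3, 3, sizeof(float), sizeof(float)) == 2560 bytes.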
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
new file mode 100644
index 0000000..1989f87
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_qa8_qa8.hpp"
+
+namespace depthwise
+{
+template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
+template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>;
+} // namespace depthwise
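For orientation, the six template arguments are <OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>, so these instantiations cover the quantized 3x3 kernel at strides 1 and 2 with a 2x2 output tile. The inner input tile read per call follows the relation used later in impl_qa8_qa8.hpp; the arithmetic below is just a worked check, not library code.

// inner_tile = stride * (output_tile - 1) + kernel
//   <2, 2, 3, 3, 1, 1>: 1 * (2 - 1) + 3 = 4  -> 4x4 input tile per 2x2 output tile
//   <2, 2, 3, 3, 2, 2>: 2 * (2 - 1) + 3 = 5  -> 5x5 input tile per 2x2 output tile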
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index dacfb24..cbdb19a 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,254 +35,206 @@
#pragma once
+using namespace neon_convolution_kernels;
+
namespace depthwise
{
-// Partial specialisation for FP16 to FP16
-template <int OutputTileRows, int OutputTileCols,
- int KernelRows, int KernelCols,
- int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
-{
- typedef DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t
- > DWC;
- template <
- bool Specialize=false, // Specialize (or not) the method
- int InPadTop=0, // If specialized, top padding
- int InPadLeft=0, // If specialized, left padding
- int InPadBottom=0, // If specialized, bottom padding
- int InPadRight=0, // If specialized, right padding
- int OutPadBottom=0, // If specialized, bottom output padding
- int OutPadRight=0 // If specialized, bottom right padding
- >
- static void process_tile(
- const int n_channels,
- const float16_t* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float16_t* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float16_t* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int in_pad_top=0,
- const int in_pad_left=0,
- const int in_pad_bottom=0,
- const int in_pad_right=0,
- const int out_pad_bottom=0,
- const int out_pad_right=0,
- const int input_offset=0,
- const int weights_offset=0
- );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
template <
- bool Specialize,
- int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
- int OutPadBottom, int OutPadRight
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
- const int n_channels,
- const float16_t *__restrict__ const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float16_t *__restrict__ const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float16_t *__restrict__ const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int _in_pad_top,
- const int _in_pad_left,
- const int _in_pad_bottom,
- const int _in_pad_right,
- const int _out_pad_bottom,
- const int _out_pad_right,
- const int _input_offset,
- const int _weights_offset
+DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+>::DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels, activation,
+ padding_top, padding_left, padding_bottom, padding_right
+ )
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float16_t *input,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ float16_t *output,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride
)
{
- constexpr auto inner_tile_rows = DWC::inner_tile_rows;
- constexpr auto inner_tile_cols = DWC::inner_tile_cols;
- constexpr auto kernel_rows = DWC::kernel_rows;
- constexpr auto kernel_cols = DWC::kernel_cols;
- constexpr auto output_tile_rows = DWC::output_tile_rows;
- constexpr auto output_tile_cols = DWC::output_tile_cols;
- constexpr auto stride_rows = DWC::stride_rows;
- constexpr auto stride_cols = DWC::stride_cols;
-
- // Extract parameters
- const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
- const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
- const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
- const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
- const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
- const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
- // Compute valid ranges of the tile
- const int in_cells_i = inner_tile_rows - in_pad_bottom;
- const int in_cells_j = inner_tile_cols - in_pad_right;
- const int out_cells_i = output_tile_rows - out_pad_bottom;
- const int out_cells_j = output_tile_cols - out_pad_right;
-
// Instantiate pointers
- const float16_t* __restrict__ inptr_base = inptr;
- const float16_t* __restrict__ wptr_base = weights;
- float16_t* __restrict__ outptr_base = outptr;
+ const float16_t* __restrict__ inptr_base = input;
+ float16_t* __restrict__ outptr_base = output;
+ const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
// Perform the depthwise convolution
int channels_remaining = n_channels;
-#ifdef __aarch64__
for (; channels_remaining >= 8; channels_remaining -= 8)
{
// Load input tile
- float16x8_t u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
+ float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
{
- const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
+ const float16_t* const inptr_row = inptr_base + i*in_row_stride;
+ for (int j = 0; j < Base::inner_tile_cols; j++)
{
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = vdupq_n_f16(0.0f);
- }
- else
- {
- u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
- }
+ u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
}
}
inptr_base += 8;
// Load weights tile
- float16x8_t w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
+ float16x8_t vbias = vld1q_f16(params);
+ params += 8;
+
+ float16x8_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
{
- const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
+ for (unsigned int j = 0; j < KernelCols; j++)
{
- w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
+ w[i][j] = vld1q_f16(params);
+ params += 8;
}
}
- wptr_base += 8;
// Perform the convolution
- float16x8_t v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
+ float16x8_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
{
- for (int out_j = 0; out_j < out_cells_j; out_j++)
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
{
+ v[out_i][out_j] = vbias;
+
// Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
// Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
{
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
{
- const int j = base_j + in_j;
- if (in_i == 0 && in_j == 0)
- {
- // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
- }
- else
- {
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
- }
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
}
}
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
+ }
}
}
// Store the output tile
- for (int i = 0; i < out_cells_i; i++)
+ for (unsigned int i = 0; i < OutputTileRows; i++)
{
float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
+ for (unsigned int j = 0; j < OutputTileCols; j++)
{
vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
}
}
outptr_base += 8;
}
-#endif // __aarch64__
for (; channels_remaining; channels_remaining--)
{
// Load input tile
- float16_t u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
+ float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
{
- const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
+ const float16_t* const inptr_row = inptr_base + i*in_row_stride;
+ for (int j = 0; j < Base::inner_tile_cols; j++)
{
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = static_cast<float16_t>(0);
- }
- else
- {
- u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
- }
+ u[i][j] = *(inptr_row + j*in_col_stride);
}
}
inptr_base++;
// Load weights tile
- float16_t w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
+ float16_t bias = *(params++);
+ float16_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
{
- const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
+ for (unsigned int j = 0; j < KernelCols; j++)
{
- w[i][j] = *(wptr_row + j*weight_col_stride);
+ w[i][j] = *(params++);
}
}
- wptr_base++;
// Perform the convolution
- float16_t v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
+ float16_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
{
- for (int out_j = 0; out_j < out_cells_j; out_j++)
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
{
// Clear the accumulator
- v[out_i][out_j] = static_cast<float16_t>(0);
+ v[out_i][out_j] = bias;
// Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
// Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
{
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
{
const int j = base_j + in_j;
v[out_i][out_j] += w[in_i][in_j] * u[i][j];
}
}
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
+ }
}
}
// Store the output tile
- for (int i = 0; i < out_cells_i; i++)
+ for (unsigned int i = 0; i < OutputTileRows; i++)
{
float16_t* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
+ for (unsigned int j = 0; j < OutputTileCols; j++)
{
*(outptr_row + j*out_col_stride) = v[i][j];
}
@@ -290,5 +242,173 @@
outptr_base++;
}
}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ // Instantiate pointers
+ const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
+ int n = 0;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
+ {
+ // Load input tile
+ float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = vld1q_f16(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float16x8_t vbias = vld1q_f16(params);
+ params += 8;
+
+ float16x8_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = vld1q_f16(params);
+ params += 8;
+ }
+ }
+
+ // Perform the convolution
+ float16x8_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ v[out_i][out_j] = vbias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ vst1q_f16(outptrs[i][j] + n, v[i][j]);
+ }
+ }
+ }
+ for (; channels_remaining; channels_remaining--, n++)
+ {
+ // Load input tile
+ float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = *(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float16_t bias = *(params++);
+ float16_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = *(params++);
+ }
+ }
+
+ // Perform the convolution
+ float16_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = bias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ *(outptrs[i][j] + n) = v[i][j];
+ }
+ }
+ }
+}
+
} // namespace depthwise
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
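Note that execute_tile now has two overloads: one takes a contiguous input tile described by a base pointer plus row/column strides, the other takes one pointer per inner-tile cell, which lets the caller redirect padded cells to, for example, a zeroed buffer instead of branching on padding inside the kernel. The sketch below derives the per-cell form from the strided form, assuming a 3x3 kernel with stride 1 and a 2x2 output tile (hence a 4x4 inner tile); `input`, `in_row_stride` and `in_col_stride` are as in the first overload.

// Build one pointer per inner-tile cell from a strided tile description.
constexpr int inner_tile_rows = 4; // StrideRows * (OutputTileRows - 1) + KernelRows
constexpr int inner_tile_cols = 4; // StrideCols * (OutputTileCols - 1) + KernelCols

const float16_t *inptrs[inner_tile_rows][inner_tile_cols];
for (int i = 0; i < inner_tile_rows; i++)
{
  for (int j = 0; j < inner_tile_cols; j++)
  {
    inptrs[i][j] = input + i * in_row_stride + j * in_col_stride;
  }
}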
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 840086f..2645761 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,254 +35,207 @@
#pragma once
+using namespace neon_convolution_kernels;
+
namespace depthwise
{
-// Partial specialisation for FP32 to FP32
-template <int OutputTileRows, int OutputTileCols,
- int KernelRows, int KernelCols,
- int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
-{
- typedef DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float
- > DWC;
- template <
- bool Specialize=false, // Specialize (or not) the method
- int InPadTop=0, // If specialized, top padding
- int InPadLeft=0, // If specialized, left padding
- int InPadBottom=0, // If specialized, bottom padding
- int InPadRight=0, // If specialized, right padding
- int OutPadBottom=0, // If specialized, bottom output padding
- int OutPadRight=0 // If specialized, bottom right padding
- >
- static void process_tile(
- const int n_channels,
- const float* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int in_pad_top=0,
- const int in_pad_left=0,
- const int in_pad_bottom=0,
- const int in_pad_right=0,
- const int out_pad_bottom=0,
- const int out_pad_right=0,
- const int input_offset=0,
- const int weights_offset=0
- );
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
template <
- bool Specialize,
- int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
- int OutPadBottom, int OutPadRight
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
>
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
- const int n_channels,
- const float *__restrict__ const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const float *__restrict__ const inptr,
- const int in_row_stride,
- const int in_col_stride,
- float *__restrict__ const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int _in_pad_top,
- const int _in_pad_left,
- const int _in_pad_bottom,
- const int _in_pad_right,
- const int _out_pad_bottom,
- const int _out_pad_right,
- const int _input_offset,
- const int _weights_offset
+DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float, float, float
+>::DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels, activation,
+ padding_top, padding_left, padding_bottom, padding_right
+ )
+{
+}
+
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float, float, float
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float *input,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ float *output,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride
)
{
- constexpr auto inner_tile_rows = DWC::inner_tile_rows;
- constexpr auto inner_tile_cols = DWC::inner_tile_cols;
- constexpr auto kernel_rows = DWC::kernel_rows;
- constexpr auto kernel_cols = DWC::kernel_cols;
- constexpr auto output_tile_rows = DWC::output_tile_rows;
- constexpr auto output_tile_cols = DWC::output_tile_cols;
- constexpr auto stride_rows = DWC::stride_rows;
- constexpr auto stride_cols = DWC::stride_cols;
-
- // Extract parameters
- const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
- const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
- const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
- const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
- const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
- const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
- // Compute valid ranges of the tile
- const int in_cells_i = inner_tile_rows - in_pad_bottom;
- const int in_cells_j = inner_tile_cols - in_pad_right;
- const int out_cells_i = output_tile_rows - out_pad_bottom;
- const int out_cells_j = output_tile_cols - out_pad_right;
-
// Instantiate pointers
- const float* __restrict__ inptr_base = inptr;
- const float* __restrict__ wptr_base = weights;
- float* __restrict__ outptr_base = outptr;
+ const float* __restrict__ inptr_base = input;
+ float* __restrict__ outptr_base = output;
+ const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
// Perform the depthwise convolution
int channels_remaining = n_channels;
-#ifdef __aarch64__
for (; channels_remaining >= 4; channels_remaining -= 4)
{
// Load input tile
- float32x4_t u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
+ float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
{
- const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
+ const float* const inptr_row = inptr_base + i*in_row_stride;
+ for (int j = 0; j < Base::inner_tile_cols; j++)
{
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = vdupq_n_f32(0.0f);
- }
- else
- {
- u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
- }
+ u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
}
}
inptr_base += 4;
// Load weights tile
- float32x4_t w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
+ float32x4_t vbias = vld1q_f32(params);
+ params += 4;
+
+ float32x4_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
{
- const float* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
+ for (unsigned int j = 0; j < KernelCols; j++)
{
- w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+ w[i][j] = vld1q_f32(params);
+ params += 4;
}
}
- wptr_base += 4;
// Perform the convolution
- float32x4_t v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
+ float32x4_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
{
- for (int out_j = 0; out_j < out_cells_j; out_j++)
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
{
+ v[out_i][out_j] = vbias;
+
// Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
// Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
{
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
{
- const int j = base_j + in_j;
- if (in_i == 0 && in_j == 0)
- {
- // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
- }
- else
- {
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
- }
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
}
}
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+ }
}
}
// Store the output tile
- for (int i = 0; i < out_cells_i; i++)
+ for (unsigned int i = 0; i < OutputTileRows; i++)
{
float* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
+ for (unsigned int j = 0; j < OutputTileCols; j++)
{
vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
}
}
outptr_base += 4;
}
-#endif // __aarch64__
for (; channels_remaining; channels_remaining--)
{
// Load input tile
- float u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
+ float u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
{
- const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
+ const float* const inptr_row = inptr_base + i*in_row_stride;
+ for (int j = 0; j < Base::inner_tile_cols; j++)
{
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = static_cast<float>(0);
- }
- else
- {
- u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
- }
+ u[i][j] = *(inptr_row + j*in_col_stride);
}
}
inptr_base++;
// Load weights tile
- float w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
+ float bias = *(params++);
+ float w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
{
- const float* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
+ for (unsigned int j = 0; j < KernelCols; j++)
{
- w[i][j] = *(wptr_row + j*weight_col_stride);
+ w[i][j] = *(params++);
}
}
- wptr_base++;
// Perform the convolution
- float v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
+ float v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
{
- for (int out_j = 0; out_j < out_cells_j; out_j++)
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
{
// Clear the accumulator
- v[out_i][out_j] = static_cast<float>(0);
+ v[out_i][out_j] = bias;
// Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
// Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
{
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
{
const int j = base_j + in_j;
v[out_i][out_j] += w[in_i][in_j] * u[i][j];
}
}
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+ }
}
}
// Store the output tile
- for (int i = 0; i < out_cells_i; i++)
+ for (unsigned int i = 0; i < OutputTileRows; i++)
{
float* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
+ for (unsigned int j = 0; j < OutputTileCols; j++)
{
*(outptr_row + j*out_col_stride) = v[i][j];
}
@@ -291,4 +244,171 @@
}
}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols, StrideRows, StrideCols,
+ float, float, float
+>::execute_tile(
+ int n_channels,
+ const void *weights_biases_ptr,
+ const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+ const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+ int n = 0;
+ for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
+ {
+ // Load input tile
+ float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = vld1q_f32(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float32x4_t vbias = vld1q_f32(params);
+ params += 4;
+
+ float32x4_t w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = vld1q_f32(params);
+ params += 4;
+ }
+ }
+
+ // Perform the convolution
+ float32x4_t v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ v[out_i][out_j] = vbias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const unsigned int j = base_j + in_j;
+
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ vst1q_f32(outptrs[i][j] + n, v[i][j]);
+ }
+ }
+ }
+ for (; channels_remaining; channels_remaining--, n++)
+ {
+ // Load input tile
+ float u[Base::inner_tile_rows][Base::inner_tile_cols];
+ for (int i = 0; i < Base::inner_tile_rows; i++)
+ {
+ for (int j = 0; j < Base::inner_tile_cols; j++)
+ {
+ u[i][j] = *(inptrs[i][j] + n);
+ }
+ }
+
+ // Load weights tile
+ float bias = *(params++);
+ float w[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ w[i][j] = *(params++);
+ }
+ }
+
+ // Perform the convolution
+ float v[OutputTileRows][OutputTileCols];
+ for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = bias;
+
+ // Base co-ordinate
+ const int base_i = out_i * StrideRows;
+ const int base_j = out_j * StrideCols;
+
+ // Fill the accumulator
+ for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+ {
+ const unsigned int i = base_i + in_i;
+ for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+
+ // Apply the activation function
+ if (Activation == ActivationFunction::ReLU ||
+ Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+ }
+ if (Activation == ActivationFunction::ReLU6)
+ {
+ v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+ }
+ }
+ }
+
+ // Store the output tile
+ for (unsigned int i = 0; i < OutputTileRows; i++)
+ {
+ for (unsigned int j = 0; j < OutputTileCols; j++)
+ {
+ *(outptrs[i][j] + n) = v[i][j];
+ }
+ }
+ }
+}
+
} // namespace depthwise
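Because Activation is a non-type template parameter, the activation branches inside execute_tile fold away at compile time, so a caller has to pick the specialisation from the run-time activation setting. The real dispatch lives in impl_base.hpp, which is not part of this hunk; the sketch below only illustrates the idea and assumes an ActivationFunction::None enumerator exists alongside ReLU and ReLU6 and that execute_tile is accessible at the call site.

#include <utility>

template <class Conv, class... Args>
void run_tile(Conv &conv, neon_convolution_kernels::ActivationFunction act, Args &&... args)
{
  using AF = neon_convolution_kernels::ActivationFunction;
  switch (act)
  {
    case AF::ReLU:
      conv.template execute_tile<AF::ReLU>(std::forward<Args>(args)...);
      break;
    case AF::ReLU6:
      conv.template execute_tile<AF::ReLU6>(std::forward<Args>(args)...);
      break;
    default:
      conv.template execute_tile<AF::None>(std::forward<Args>(args)...);
      break;
  }
}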
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
new file mode 100644
index 0000000..5546d37
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include <limits>
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
+template <typename T>
+inline T saturating_doubling_high_mul(const T&, const int32_t&);
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+ return vqrdmulhq_n_s32(a, b);
+}
+
+template <>
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+ return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+template <typename T>
+inline T rounding_divide_by_exp2(const T& x, const int exponent);
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+ const int32x4_t shift = vdupq_n_s32(-exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+ const int32x4_t fixed = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+ const int32x2_t shift = vdup_n_s32(-exponent);
+ const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+ const int32x2_t fixed = vqadd_s32(x, fixup);
+ return vrshl_s32(fixed, shift);
+}
+
+template <>
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+ const int32x2_t xs = vdup_n_s32(x);
+ return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
+namespace depthwise
+{
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ ) : QAsymm8DepthwiseConvolution(
+ n_batches, n_input_rows, n_input_cols, n_channels,
+ activation, weight_quantisation, input_quantisation, output_quantisation,
+ QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
+ padding_top, padding_left, padding_bottom, padding_right
+)
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::QAsymm8DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ const ActivationFunction activation,
+ const QAsymm8Params& weight_quantisation,
+ const QAsymm8Params& input_quantisation,
+ const QAsymm8Params& output_quantisation,
+ const QAsymm8RescaleParams& rescale_params,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ ) : Base(
+ n_batches, n_input_rows, n_input_cols, n_channels, activation,
+ padding_top, padding_left, padding_bottom, padding_right
+),
+ _weights_quant(weight_quantisation),
+ _inputs_quant(input_quantisation),
+ _output_quant(output_quantisation),
+ rescale_parameters(rescale_params)
+{
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+uint8_t QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::_input_padding_value(void) const
+{
+ return _inputs_quant.offset;
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::_pack_params(
+ void * const buffer,
+ const void * const weights,
+ const unsigned int weight_row_stride,
+ const unsigned int weight_col_stride,
+ const void * const biases
+ ) const
+{
+ const uint8_t *wptr = static_cast<const uint8_t *>(weights);
+ const int32_t *bptr = static_cast<const int32_t *>(biases);
+ uint8_t *outptr = static_cast<uint8_t *>(buffer);
+
+  // We set the vector length to use 64-bit (double-word) registers on both
+  // AArch64 and AArch32. NOTE: For SVE, set this to half the vector length.
+ unsigned int veclen = 8;
+
+  // While there are channels left to process, pack a vector length of them at
+  // a time, reducing the vector length used as the number of remaining
+  // channels decreases.
+ for (
+ unsigned int n_channels = this->n_channels(); n_channels;
+ n_channels -= veclen,
+ outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
+ )
+ {
+    // NOTE: Ignore this section if using SVE; the vector length remains the
+    // same and we just don't fill a full register for the tail.
+ while (n_channels < veclen)
+ {
+ // Reduce the vector length to either 8 or 1 (scalar)
+ // TODO Support more vector lengths in `execute_tile`.
+ veclen = (veclen == 16) ? 8 : 1;
+ }
+
+ // Get pointers to bias and weight portions of the output structure.
+ int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
+ uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
+
+ // Copy a vector length of elements
+ for (unsigned int n = 0; n < veclen && n < n_channels; n++)
+ {
+ const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
+ out_bptr[n] = bias;
+
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
+ row_outptr[j*veclen + n] = w;
+ }
+ }
+ wptr++;
+ }
+ }
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename FInput, typename FOutput
+>
+static inline void tilefn(
+ int n_channels,
+ const void* packed_params,
+ FInput &get_input_ptr,
+ FOutput &get_output_ptr,
+ const int32_t clamp_max,
+ const int32_t clamp_min,
+ const uint8_t input_offset,
+ const uint8_t weight_offset,
+ const uint8_t output_offset,
+ const int32_t requant_multiplier,
+ const int32_t requant_shift
+ )
+{
+ constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
+ constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
+
+ // Offset into channels
+ int channel = 0;
+
+ // Byte type pointer to weights and biases
+ const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
+
+ for (; n_channels >= 8; n_channels -= 8, channel += 8)
+ {
+ const int32x4_t biases[2] = {
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
+ vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
+ };
+ wbptr += 8*sizeof(int32_t);
+
+ int16x8_t weights[KernelRows][KernelCols];
+ const uint8x8_t woffset = vdup_n_u8(weight_offset);
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ const uint8x8_t w = vld1_u8(wbptr);
+ weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
+ wbptr += 8;
+ }
+ }
+
+ int16x8_t inputs[InnerTileRows][InnerTileCols];
+ const uint8x8_t ioffset = vdup_n_u8(input_offset);
+ for (unsigned int i = 0; i < InnerTileRows; i++)
+ {
+ for (unsigned int j = 0; j < InnerTileCols; j++)
+ {
+ const auto x = vld1_u8(get_input_ptr(i, j, channel));
+ inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
+ }
+ }
+
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ int32x4_t accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ accs[i] = biases[i];
+ }
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const auto w = weights[wi][wj];
+ const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
+ accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
+ accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
+ }
+ }
+
+ int32x4_t final_accs[2];
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ const int32x4_t y = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(accs[i], requant_multiplier),
+ requant_shift);
+ const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
+ final_accs[i] = vaddq_s32(y, offset);
+ final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
+ final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
+ }
+
+ const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
+ vreinterpretq_s16_s32(final_accs[1]));
+ const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
+ const uint8x8_t output =
+ vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
+ vst1_u8(get_output_ptr(oi, oj, channel), output);
+ }
+ }
+ }
+ for (; n_channels; n_channels--, channel++)
+ {
+ // Load bias
+ const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
+ wbptr += sizeof(int32_t);
+
+ // Load weights
+ int16_t weights[KernelRows][KernelCols];
+ for (unsigned int i = 0; i < KernelRows; i++)
+ {
+ for (unsigned int j = 0; j < KernelCols; j++)
+ {
+ weights[i][j] = *(wbptr++) - weight_offset;
+ }
+ }
+
+ // Load the input activations
+ int16_t inputs[InnerTileRows][InnerTileCols];
+ for (unsigned int i = 0; i < InnerTileRows; i++)
+ {
+ for (unsigned int j = 0; j < InnerTileCols; j++)
+ {
+ inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
+ }
+ }
+
+ // Perform the convolution
+ for (unsigned int oi = 0; oi < OutputTileRows; oi++)
+ {
+ for (unsigned int oj = 0; oj < OutputTileCols; oj++)
+ {
+ int32_t acc = bias;
+
+ for (unsigned int wi = 0; wi < KernelRows; wi++)
+ {
+ for (unsigned int wj = 0; wj < KernelCols; wj++)
+ {
+ const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
+ acc += w * x;
+ }
+ }
+
+ // Requantize
+ acc = rounding_divide_by_exp2(
+ saturating_doubling_high_mul(acc, requant_multiplier),
+ requant_shift);
+ acc += output_offset;
+ acc = std::max(acc, clamp_min);
+ acc = std::min(acc, clamp_max);
+ uint8_t output = static_cast<uint8_t>(acc);
+ *(get_output_ptr(oi, oj, channel)) = output;
+ }
+ }
+ }
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename FInput, typename FOutput
+>
+static inline void execute_tilefn(
+ int n_channels,
+ const void* packed_params,
+ const nck::ActivationFunction actfn,
+ FInput &get_input_ptr,
+ FOutput &get_output_ptr,
+ const QAsymm8Params &input_quant,
+ const QAsymm8Params &weight_quant,
+ const QAsymm8Params &output_quant,
+ const QAsymm8RescaleParams &requant
+ ) {
+ // Compute min/max clamp values
+ int32_t clamp_min = std::numeric_limits<uint8_t>::min();
+ int32_t clamp_max = std::numeric_limits<uint8_t>::max();
+
+ if (actfn == nck::ActivationFunction::ReLU ||
+ actfn == nck::ActivationFunction::ReLU6) {
+ const int32_t bottom_rail = output_quant.offset;
+ clamp_min = std::max(clamp_min, bottom_rail);
+ }
+
+ if (actfn == nck::ActivationFunction::ReLU6) {
+ const int32_t top_rail = output_quant.quantize(6.0f);
+ clamp_max = std::min(clamp_max, top_rail);
+ }
+
+ // Call the tile execution method
+ tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+ StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
+ clamp_max, clamp_min, input_quant.offset,
+ weight_quant.offset, output_quant.offset,
+ requant.multiplier, requant.shift);
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <nck::ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ uint8_t* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ ) {
+ // Construct methods to get pointers
+ const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
+ const int i, const int j, const int channel) {
+ return inptr + i * in_row_stride + j * in_col_stride + channel;
+ };
+
+ const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
+ const int i, const int j, const int channel) {
+ return outptr + i * out_row_stride + j * out_col_stride + channel;
+ };
+
+ execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+ StrideRows, StrideCols>(
+ n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
+ _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+}
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+template <nck::ActivationFunction Activation>
+void QAsymm8DepthwiseConvolution<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+>::execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+ uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+ ) {
+ // Construct methods to get pointers
+ const auto get_input_ptr = [inptrs](const int i, const int j,
+ const int channel) {
+ return inptrs[i][j] + channel;
+ };
+
+ const auto get_output_ptr = [outptrs](const int i, const int j,
+ const int channel) {
+ return outptrs[i][j] + channel;
+ };
+
+ // Call the tile execution method
+ execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+ StrideRows, StrideCols>(
+ n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
+ _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
+}
+
+} // namespace depthwise
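The requantisation in tilefn above follows the usual fixed-point scheme: the int32 accumulator is scaled by a saturating doubling high multiply with QAsymm8RescaleParams::multiplier, rounding-shifted right by QAsymm8RescaleParams::shift, offset by the output zero point, and clamped. Below is a scalar reference using the int32_t helpers defined at the top of this header; parameter names are illustrative, and <algorithm> is assumed available as in the scalar tail above.

inline uint8_t requantize_scalar(int32_t acc,
                                 int32_t multiplier,     // QAsymm8RescaleParams::multiplier
                                 int32_t shift,          // QAsymm8RescaleParams::shift
                                 int32_t output_offset,  // output zero point
                                 int32_t clamp_min,
                                 int32_t clamp_max)
{
  int32_t y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift);
  y += output_offset;
  y = std::max(y, clamp_min);
  y = std::min(y, clamp_max);
  return static_cast<uint8_t>(y);
}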
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
deleted file mode 100644
index d0d8de5..0000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
-
-#pragma once
-
-namespace depthwise
-{
-// Partial specialisation for U8 to S32
-template <int OutputTileRows, int OutputTileCols,
- int KernelRows, int KernelCols,
- int StrideRows, int StrideCols>
-struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
-{
- typedef DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t
- > DWC;
-
- template <
- bool Specialize=false, // Specialize (or not) the method
- int InPadTop=0, // If specialized, top padding
- int InPadLeft=0, // If specialized, left padding
- int InPadBottom=0, // If specialized, bottom padding
- int InPadRight=0, // If specialized, right padding
- int OutPadBottom=0, // If specialized, bottom output padding
- int OutPadRight=0 // If specialized, bottom right padding
- >
- static void process_tile(
- const int n_channels,
- const uint8_t* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const uint8_t* const inptr,
- const int in_row_stride,
- const int in_col_stride,
- int32_t* const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int in_pad_top=0,
- const int in_pad_left=0,
- const int in_pad_bottom=0,
- const int in_pad_right=0,
- const int out_pad_bottom=0,
- const int out_pad_right=0,
- const int input_offset=0,
- const int weights_offset=0);
-};
-
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC>
-template <
- bool Specialize,
- int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
- int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
- const int n_channels,
- const uint8_t *__restrict__ const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const uint8_t *__restrict__ const inptr,
- const int in_row_stride,
- const int in_col_stride,
- int32_t *__restrict__ const outptr,
- const int out_row_stride,
- const int out_col_stride,
- const int _in_pad_top,
- const int _in_pad_left,
- const int _in_pad_bottom,
- const int _in_pad_right,
- const int _out_pad_bottom,
- const int _out_pad_right,
- const int _input_offset,
- const int _weights_offset
-)
-{
- constexpr auto inner_tile_rows = DWC::inner_tile_rows;
- constexpr auto inner_tile_cols = DWC::inner_tile_cols;
- constexpr auto kernel_rows = DWC::kernel_rows;
- constexpr auto kernel_cols = DWC::kernel_cols;
- constexpr auto output_tile_rows = DWC::output_tile_rows;
- constexpr auto output_tile_cols = DWC::output_tile_cols;
- constexpr auto stride_rows = DWC::stride_rows;
- constexpr auto stride_cols = DWC::stride_cols;
-
- // Extract parameters
- const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
- const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
- const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
- const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
- const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
- const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
- // Compute valid ranges of the tile
- const int in_cells_i = inner_tile_rows - in_pad_bottom;
- const int in_cells_j = inner_tile_cols - in_pad_right;
- const int out_cells_i = output_tile_rows - out_pad_bottom;
- const int out_cells_j = output_tile_cols - out_pad_right;
-
- // Instantiate pointers
- const uint8_t* __restrict__ inptr_base = inptr;
- const uint8_t* __restrict__ wptr_base = weights;
- int32_t* __restrict__ outptr_base = outptr;
-
- // Perform the depthwise convolution
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
- const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
- for (; channels_remaining >= 16; channels_remaining -= 16)
- {
- // Load input tile
- int32x4x4_t u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
- {
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j].val[0] = vdupq_n_s32(0);
- u[i][j].val[1] = vdupq_n_s32(0);
- u[i][j].val[2] = vdupq_n_s32(0);
- u[i][j].val[3] = vdupq_n_s32(0);
- }
- else
- {
- const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
- u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
- u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
- u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
- u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
- }
- }
- }
- inptr_base += 16;
-
- // Load weights tile
- int32x4x4_t w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
- {
- const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
- {
- const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
- w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
- w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
- w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
- w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
- }
- }
- wptr_base += 16;
-
- // Perform the convolution
- int32x4x4_t v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
- {
- for (int out_j = 0; out_j < out_cells_j; out_j++)
- {
- // Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
-
- // Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
- {
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
- {
- const int j = base_j + in_j;
- if (in_i == 0 && in_j == 0)
- {
- // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
- v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
- v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
- v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
- v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
- }
- else
- {
- // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
- v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
- v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
- v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
- }
- }
- }
- }
- }
-
- // Store the output tile
- for (int i = 0; i < out_cells_i; i++)
- {
- int32_t* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
- {
- vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
- vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
- vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
- vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
- }
- }
- outptr_base += 16;
- }
-#endif // __aarch64__
- for (; channels_remaining; channels_remaining--)
- {
- // Load input tile
- int32_t u[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
- for (int j = 0; j < inner_tile_cols; j++)
- {
- if (i < in_pad_top || in_cells_i <= i ||
- j < in_pad_left || in_cells_j <= j)
- {
- u[i][j] = static_cast<uint8_t>(0);
- }
- else
- {
- u[i][j] = static_cast<int32_t >(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
- }
- }
- }
- inptr_base++;
-
- // Load weights tile
- int32_t w[kernel_rows][kernel_cols];
- for (int i = 0; i < kernel_rows; i++)
- {
- const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
- for (int j = 0; j < kernel_cols; j++)
- {
- w[i][j] = static_cast<int32_t >(*(wptr_row + j*weight_col_stride)) + _weights_offset;
- }
- }
- wptr_base++;
-
- // Perform the convolution
- int32_t v[output_tile_rows][output_tile_cols];
- for (int out_i = 0; out_i < out_cells_i; out_i++)
- {
- for (int out_j = 0; out_j < out_cells_j; out_j++)
- {
- // Clear the accumulator
- v[out_i][out_j] = static_cast<int32_t>(0);
-
- // Base co-ordinate
- const int base_i = out_i * stride_rows;
- const int base_j = out_j * stride_cols;
-
- // Fill the accumulator
- for (int in_i = 0; in_i < kernel_rows; in_i++)
- {
- const int i = base_i + in_i;
- for (int in_j = 0; in_j < kernel_cols; in_j++)
- {
- const int j = base_j + in_j;
- v[out_i][out_j] += w[in_i][in_j] * u[i][j];
- }
- }
- }
- }
-
- // Store the output tile
- for (int i = 0; i < out_cells_i; i++)
- {
- int32_t* const outptr_row = outptr_base + i*out_row_stride;
- for (int j = 0; j < out_cells_j; j++)
- {
- *(outptr_row + j*out_col_stride) = v[i][j];
- }
- }
- outptr_base++;
- }
-}
-
-} // namespace depthwise
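
For reference, the arithmetic the deleted U8-to-S32 tile kernel performs for one output cell reduces to widening each input and weight byte to int32, adding the respective quantisation offsets, and accumulating the products. A minimal scalar sketch of that step (a hypothetical helper, not part of the library):

// Hypothetical reference helper mirroring the scalar tail loop above:
// (input + input_offset) * (weight + weights_offset), accumulated in int32.
#include <cstdint>

int32_t dwc_accumulate_cell(const uint8_t *in, int in_stride,
                            const uint8_t *w, int w_stride,
                            int kernel_rows, int kernel_cols,
                            int32_t input_offset, int32_t weights_offset)
{
    int32_t acc = 0;
    for (int i = 0; i < kernel_rows; i++)
    {
        for (int j = 0; j < kernel_cols; j++)
        {
            const int32_t x = static_cast<int32_t>(in[i * in_stride + j]) + input_offset;
            const int32_t k = static_cast<int32_t>(w[i * w_stride + j]) + weights_offset;
            acc += x * k;
        }
    }
    return acc;
}
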
diff --git a/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
deleted file mode 100644
index ac83bf9..0000000
--- a/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
-
-using namespace winograd;
-
-template <const int MB, const int NB, typename TIn, typename TOut>
-BatchedBlockedGemm<MB, NB, TIn, TOut>::BatchedBlockedGemm(
- const unsigned int n_gemms,
- const int M, const int K, const int N,
- const int a_matrix_stride,
- const int a_row_stride,
- const int b_matrix_stride,
- const int b_row_stride,
- const int c_matrix_stride,
- const int c_row_stride,
- const TIn* const a_ptr,
- const TIn* const b_ptr,
- TOut* const c_ptr
-) : n_gemms(n_gemms), M(M), N(N), K(K),
- a_matrix_stride(a_matrix_stride),
- a_row_stride(a_row_stride),
- b_matrix_stride(b_matrix_stride),
- b_row_stride(b_row_stride),
- c_matrix_stride(c_matrix_stride),
- c_row_stride(c_row_stride),
- a_ptr(a_ptr), b_ptr(b_ptr), c_ptr(c_ptr)
-{
-}
-
-template <const int MBlock, const int NBlock, typename TIn, typename TOut>
-unsigned int BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::get_window() const
-{
- return n_gemms;
-}
-
-template <const int MBlock, const int NBlock, typename TIn, typename TOut>
-void BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::run(
- const unsigned int start, const unsigned int stop
-)
-{
- // Perform the specified GEMMs
- for (unsigned int i = start; i < stop; i++)
- {
- // Get pointers to the relevant matrices
- const TIn* const mtr_a = a_ptr + i*a_matrix_stride;
- const TIn* const mtr_b = b_ptr + i*b_matrix_stride;
- TOut* const mtr_c = c_ptr + i*c_matrix_stride;
-
- // Perform the GEMM
- BlockedGemm<MBlock, NBlock, TIn, TOut>(
- mtr_a, mtr_b, mtr_c, M, K, N,
- a_row_stride, b_row_stride, c_row_stride
- );
- }
-}
-
-template class winograd::BatchedBlockedGemm<4, 16, float, float>;
-
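
The deleted wrapper's run() simply walks a window of independent GEMMs, offsetting A, B and C by their per-matrix strides before delegating to BlockedGemm. A sketch of that batching logic, with a naive triple-loop GEMM standing in for BlockedGemm (both helper names are hypothetical):

// naive_gemm is a stand-in for the blocked kernel; it only illustrates the
// row-stride addressing used by the deleted class.
void naive_gemm(const float *a, const float *b, float *c,
                int M, int K, int N, int lda, int ldb, int ldc)
{
    for (int m = 0; m < M; m++)
    {
        for (int n = 0; n < N; n++)
        {
            float acc = 0.f;
            for (int k = 0; k < K; k++)
            {
                acc += a[m * lda + k] * b[k * ldb + n];
            }
            c[m * ldc + n] = acc;
        }
    }
}

// One GEMM per window index, each operating on its own slice of A, B and C.
void run_batched(unsigned start, unsigned stop,
                 const float *a, const float *b, float *c,
                 int M, int K, int N,
                 int a_matrix_stride, int b_matrix_stride, int c_matrix_stride,
                 int a_row_stride, int b_row_stride, int c_row_stride)
{
    for (unsigned i = start; i < stop; i++)
    {
        naive_gemm(a + i * a_matrix_stride, b + i * b_matrix_stride, c + i * c_matrix_stride,
                   M, K, N, a_row_stride, b_row_stride, c_row_stride);
    }
}
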
diff --git a/src/core/NEON/kernels/convolution/winograd/padding.cpp b/src/core/NEON/kernels/convolution/winograd/padding.cpp
new file mode 100644
index 0000000..46fe57c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/padding.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cstring>
+#include <cstdint>
+
+#include "padding.hpp"
+
+namespace padding
+{
+
+template <typename T>
+void copy_and_pad_tile(
+ const unsigned int tile_rows,
+ const unsigned int tile_cols,
+ const unsigned int n_channels,
+ const T* const inptr,
+ const unsigned int in_row_stride,
+ const unsigned int in_col_stride,
+ T* const outptr,
+ const unsigned int out_row_stride,
+ const unsigned int out_col_stride,
+ const unsigned int pad_top,
+ const unsigned int pad_left,
+ const unsigned int pad_bottom,
+ const unsigned int pad_right,
+ const T pad_value
+)
+{
+ for (unsigned int out_i = 0; out_i < tile_rows; out_i++)
+ {
+ for (unsigned int out_j = 0; out_j < tile_cols; out_j++)
+ {
+ T* const output = outptr + out_i*out_row_stride + out_j*out_col_stride;
+
+ if (out_i < pad_top || tile_rows - pad_bottom <= out_i ||
+ out_j < pad_left || tile_cols - pad_right <= out_j)
+ {
+ for (unsigned int n = 0; n < n_channels; n++)
+ {
+ output[n] = pad_value;
+ }
+ }
+ else
+ {
+ const auto in_i = out_i - pad_top, in_j = out_j - pad_left;
+ const T* const input = inptr + in_i*in_row_stride + in_j*in_col_stride;
+ std::memcpy(output, input, n_channels * sizeof(T));
+ }
+ }
+ }
+}
+
+template void copy_and_pad_tile(
+ unsigned int, unsigned int, unsigned int,
+ const uint8_t *, unsigned int, unsigned int,
+ uint8_t *, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int, uint8_t
+);
+
+template void copy_and_pad_tile(
+ unsigned int, unsigned int, unsigned int,
+ const float *, unsigned int, unsigned int,
+ float *, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int, float
+);
+
+template <unsigned int TileRows, unsigned int TileCols>
+void CopyCropped<TileRows, TileCols>::execute(
+ const size_t size,
+ const void * const inptr,
+ const size_t in_row_stride,
+ const size_t in_col_stride,
+ void * const outptr,
+ const size_t out_row_stride,
+ const size_t out_col_stride,
+ const unsigned int pad_top,
+ const unsigned int pad_left,
+ const unsigned int pad_bottom,
+ const unsigned int pad_right
+)
+{
+ for (unsigned int out_i = 0, in_i = pad_top; in_i < TileRows - pad_bottom; out_i++, in_i++)
+ {
+ for (unsigned int out_j = 0, in_j = pad_left; in_j < TileCols - pad_right; out_j++, in_j++)
+ {
+ std::memcpy(
+ static_cast<uint8_t *>(outptr) + out_i*out_row_stride + out_j*out_col_stride,
+ static_cast<const uint8_t *>(inptr) + in_i*in_row_stride + in_j*in_col_stride,
+ size
+ );
+ }
+ }
+}
+
+template class CopyCropped<2, 2>;
+template class CopyCropped<3, 3>;
+template class CopyCropped<4, 4>;
+
+template <typename T>
+void crop_and_copy_tile(
+ unsigned int tile_rows,
+ unsigned int tile_cols,
+ unsigned int n_channels,
+ const T *inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ T *outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride,
+ unsigned int crop_top,
+ unsigned int crop_left,
+ unsigned int crop_bottom,
+ unsigned int crop_right
+)
+{
+ for (unsigned int out_i = 0, in_i = crop_top; in_i < tile_rows - crop_bottom; out_i++, in_i++)
+ {
+ for (unsigned int out_j = 0, in_j = crop_left; in_j < tile_cols - crop_right; out_j++, in_j++)
+ {
+ std::memcpy(
+ outptr + out_i*out_row_stride + out_j*out_col_stride,
+ inptr + in_i*in_row_stride + in_j*in_col_stride,
+ sizeof(T) * n_channels
+ );
+ }
+ }
+}
+
+template void crop_and_copy_tile(
+ unsigned int tile_rows,
+ unsigned int tile_cols,
+ unsigned int n_channels,
+ const float *inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ float *outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride,
+ unsigned int crop_top,
+ unsigned int crop_left,
+ unsigned int crop_bottom,
+ unsigned int crop_right
+);
+
+} // namespace padding
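
A small usage sketch of the new helper, assuming padding.hpp declares the template defined above: embed a 2x2 float patch in the centre of a 4x4 tile, filling the one-element border with zero.

// Assumed: padding.hpp declares padding::copy_and_pad_tile<T>.
#include "padding.hpp"

void pad_example()
{
    const float src[2 * 2] = { 1.f, 2.f, 3.f, 4.f };
    float dst[4 * 4];

    padding::copy_and_pad_tile<float>(
        4, 4, 1,      // tile_rows, tile_cols, n_channels
        src, 2, 1,    // input pointer, row stride, column stride
        dst, 4, 1,    // output pointer, row stride, column stride
        1, 1, 1, 1,   // pad_top, pad_left, pad_bottom, pad_right
        0.f);         // pad_value
}
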
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
deleted file mode 100644
index e66300d..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_1x8_fp32_process_tile(
- int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix_base,
- const int matrix_stride,
- const int _pad_top,
- const int _pad_left,
- const int _pad_bottom,
- const int _pad_right
-)
-{
- (void) input_row_stride; // No rows over which to stride
- (void) _pad_top; // Never any top padding
- (void) _pad_bottom; // Never any bottom padding
-
- // Extract padding arguments
- const int pad_left = Specialized ? PadLeft : _pad_left;
- const int pad_right = Specialized ? PadRight : _pad_right;
-
- constexpr int inner_tile_cols = 8;
- const int cells_j = inner_tile_cols - pad_right;
-
- float *outptr = matrix_base;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_cols];
- for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
- {
- x_ptrs[j] = input_base + xj*input_col_stride;
- }
-
- // Vectors used/computed in this kernel.
- float x[inner_tile_cols];
- float U[inner_tile_cols];
-
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = 0.0f;
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __arm_any__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- float32x4_t x[inner_tile_cols], U[inner_tile_cols];
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vdupq_n_f32(0.0f);
- }
-
- // Load x
- for (int j = pad_left; j < cells_j; j++)
- {
- x[j] = vld1q_f32(x_ptrs[j]);
- x_ptrs[j] += 4;
- }
-
- // Compute U = x . X
- U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
- U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
- U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
- U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
- U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
- U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
- U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
- U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- vst1q_f32(outptr + j*matrix_stride, U[j]);
- }
- outptr += 4;
- }
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- float32x2_t x[inner_tile_cols], U[inner_tile_cols];
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[j] = vdup_n_f32(0.0f);
- }
-
- // Load x
- for (int j = pad_left; j < cells_j; j++)
- {
- x[j] = vld1_f32(x_ptrs[j]);
- x_ptrs[j] += 2;
- }
-
- // Compute U = x . X
- U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
- U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
- U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
- U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
- U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
- U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
- U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
- U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- vst1_f32(outptr + j*matrix_stride, U[j]);
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int j = pad_left; j < cells_j; j++)
- {
- x[j] = *(x_ptrs[j]++);
- }
-
- // Compute U = x . X
- U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
- U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
- U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
- U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
- U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
- U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
- U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
- U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
-
- // Store the transformed vector
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = U[j];
- }
- outptr++;
- }
-}
-
-}
-
-namespace winograd
-{
-template <int x>
-using Tiles = InputTransformImplTiles<1, x, 1, 8, float>;
-
-/*****************************************************************************/
-// 1x3 specialisations
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-/*****************************************************************************/
-// 1x5 specialisations
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-/*****************************************************************************/
-// 1x7 specialisations
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>,
-};
-
-template <>
-const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
- winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
-};
-/*****************************************************************************/
-
-
-template class InputTransform<1, 3, 1, 8, float>;
-template class InputTransform<3, 1, 8, 1, float>;
-template class InputTransform<1, 5, 1, 8, float>;
-template class InputTransform<5, 1, 8, 1, float>;
-template class InputTransform<1, 7, 1, 8, float>;
-template class InputTransform<7, 1, 8, 1, float>;
-} // namespace winograd
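
Each of the 1x8 specialisations above evaluates U = B^T x per channel. For reference, the rows of B^T implied by the scalar tail loop are (sketch only, written out as a constant):

// Coefficients read off the deleted scalar tail loop; row i gives U[i] as a
// dot product with the padded input vector x[0..7].
static const float BT_1x8[8][8] = {
    { -36,   0,  49,   0, -14,   0,  1,  0 },
    {   0, -36,  36,  13, -13,  -1,  1,  0 },
    {   0,  36,  36, -13, -13,   1,  1,  0 },
    {   0, -18,   9,  20, -10,  -2,  1,  0 },
    {   0,  18,   9, -20, -10,   2,  1,  0 },
    {   0, -12,   4,  15,  -5,  -3,  1,  0 },
    {   0,  12,   4, -15,  -5,   3,  1,  0 },
    {   0, -36,   0,  49,   0, -14,  0,  1 },
};
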
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
deleted file mode 100644
index 4203945..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>;
-
-namespace
-{
-
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_4x4_fp32_process_tile(
- int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix_base,
- const int matrix_stride,
- const int _pad_top,
- const int _pad_left,
- const int _pad_bottom,
- const int _pad_right
- )
-{
-  const int pad_top = Specialized ? PadTop : _pad_top;
-  const int pad_left = Specialized ? PadLeft : _pad_left;
-  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
-  const int pad_right = Specialized ? PadRight : _pad_right;
-
-  constexpr int inner_tile_i = 4, inner_tile_j = 4;
-  const int cells_i = inner_tile_i - pad_bottom;
-  const int cells_j = inner_tile_j - pad_right;
-
- float *outptr = matrix_base;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_i][inner_tile_j];
- for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[inner_tile_i][inner_tile_j];
- float XTx[inner_tile_i][inner_tile_j];
- float U[inner_tile_i][inner_tile_j];
-
- for (int i = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel.
- float32x4_t x[inner_tile_i][inner_tile_j];
- float32x4_t XTx[inner_tile_i][inner_tile_j];
- float32x4_t U[inner_tile_i][inner_tile_j];
-
- for (int i = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++)
- {
- x[i][j] = vdupq_n_f32(0.0f);
- XTx[i][j] = vdupq_n_f32(0.0f);
- }
- }
-
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1q_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_i; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel.
- float32x2_t x[inner_tile_i][inner_tile_j];
- float32x2_t XTx[inner_tile_i][inner_tile_j];
- float32x2_t U[inner_tile_i][inner_tile_j];
-
- for (int i = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = x[0][j] - x[2][j];
- XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
-
- // XTx[1][j] = x[1][j] + x[2][j];
- XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
-
- // XTx[2][j] = x[2][j] - x[1][j];
- XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
-
- // XTx[3][j] = x[1][j] - x[3][j];
- XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_i; i++)
- {
- // U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
-
- // U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
-
- // U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
-
- // U[i][3] = XTx[i][1] - XTx[i][3];
- U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- XTx[0][j] = x[0][j] - x[2][j];
- XTx[1][j] = x[1][j] + x[2][j];
- XTx[2][j] = x[2][j] - x[1][j];
- XTx[3][j] = x[1][j] - x[3][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_i; i++)
- {
- U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][3] = XTx[i][1] - XTx[i][3];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-} // namespace (anonymous)
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>;
-
-
-template <>
-const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = {
- winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>,
-};
-
-template class InputTransform<3, 3, 4, 4, float>;
-} // namespace winograd
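
For reference, the per-channel arithmetic of the deleted F(2x2, 3x3) input transform, U = B^T x B, reduces to the following scalar sketch (a hypothetical standalone function mirroring the tail loop above):

// Left-multiply by B^T (rows), then right-multiply by B (columns).
void input_transform_4x4(const float x[4][4], float U[4][4])
{
    float XTx[4][4];
    for (int j = 0; j < 4; j++)
    {
        XTx[0][j] = x[0][j] - x[2][j];
        XTx[1][j] = x[1][j] + x[2][j];
        XTx[2][j] = x[2][j] - x[1][j];
        XTx[3][j] = x[1][j] - x[3][j];
    }
    for (int i = 0; i < 4; i++)
    {
        U[i][0] = XTx[i][0] - XTx[i][2];
        U[i][1] = XTx[i][1] + XTx[i][2];
        U[i][2] = XTx[i][2] - XTx[i][1];
        U[i][3] = XTx[i][1] - XTx[i][3];
    }
}
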
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
deleted file mode 100644
index 893122c..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
-void winograd_input_transform_6x6_fp32_process_tile(
- int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix_base,
-  const int matrix_stride,
- const int _pad_top,
- const int _pad_left,
- const int _pad_bottom,
- const int _pad_right
-)
-{
- const int pad_top = Specialized ? PadTop : _pad_top;
- const int pad_left = Specialized ? PadLeft : _pad_left;
- const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
- const int pad_right = Specialized ? PadRight : _pad_right;
-
- constexpr int inner_tile_rows = 6;
- constexpr int inner_tile_cols = 6;
-
- const int cells_i = inner_tile_rows - pad_bottom;
- const int cells_j = inner_tile_cols - pad_right;
-
- float *outptr = matrix_base;
-
- // Get pointers into the input tile
- const float *x_ptrs[inner_tile_rows][inner_tile_cols];
- for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[inner_tile_rows][inner_tile_cols];
- float XTx[inner_tile_rows][inner_tile_cols];
- float U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel
- float32x4_t x[inner_tile_rows][inner_tile_cols];
- float32x4_t XTx[inner_tile_rows][inner_tile_cols];
- float32x4_t U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdupq_n_f32(0.0f);
- XTx[i][j] = vdupq_n_f32(0.0f);
- }
- }
-
-    // Read a 6x6 tile of the input
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1q_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel
- float32x2_t x[inner_tile_rows][inner_tile_cols];
- float32x2_t XTx[inner_tile_rows][inner_tile_cols];
- float32x2_t U[inner_tile_rows][inner_tile_cols];
- for (int i = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
-    // Read a 6x6 tile of the input
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < inner_tile_rows; i++)
- {
- U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < inner_tile_rows; i++)
- {
- for (int j = 0; j < inner_tile_cols; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-}
-
-namespace winograd
-{
-template <int k>
-using Tiles = InputTransformImplTiles<k, k, 6, 6, float>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
-};
-
-template <>
-const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
-
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
-
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
-};
-
-template <>
-const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
- winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
-};
-
-template class InputTransform<3, 3, 6, 6, float>;
-template class InputTransform<5, 5, 6, 6, float>;
-}
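
The 6x6 kernel above computes U = B^T x B; for reference, the rows of B^T read off its scalar tail loop are (sketch only):

// Row i gives the i-th combination XTx[i] as a dot product with x[0..5].
static const float BT_6x6[6][6] = {
    { 4,  0, -5,  0, 1, 0 },
    { 0, -4, -4,  1, 1, 0 },
    { 0,  4, -4, -1, 1, 0 },
    { 0, -2, -1,  2, 1, 0 },
    { 0,  2, -1, -2, 1, 0 },
    { 0,  4,  0, -5, 0, 1 },
};
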
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
deleted file mode 100644
index 597b074..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_2x2_3x3_fp32_process_tile(
- const int n_channels,
- const float* const matrix_base,
- const int matrix_stride,
- const float* const biases,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
-)
-{
- constexpr int OutputTileRows = 2, OutputTileCols = 2;
- const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
- const int pad_right = Specialized ? PadRight : _pad_right;
-
- const int cells_i = OutputTileRows - pad_bottom;
- const int cells_j = OutputTileCols - pad_right;
-
- // Construct a map to the output cells
- float *outptrs[OutputTileRows][OutputTileCols];
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
- const float *inptr = matrix_base;
- const float *bptr = biases;
-
- if (bptr)
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- b = vld1q_f32(bptr);
- bptr += 4;
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Load the bias vector
- b = vld1_f32(bptr);
- bptr += 2;
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- }
-
- // Load the bias
- b = *(bptr++);
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j] + b;
- }
- }
- }
- }
- else
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[4][4], FZ[4][2], f[2][2];
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[4][4], FZ[4][2], f[2][2];
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
-
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[4][4], FZ[4][2], f[2][2];
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
- {
- for (int j = 0; j < 4; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j];
- }
- }
- }
- }
-}
-
-} // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0>
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1>
-};
-
-template class OutputTransform<3, 3, 4, 4, float>;
-} // namespace winograd
-
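The file deleted above implemented the F(2x2, 3x3) Winograd output transform with a 4-lane NEON path, a 2-lane path and a scalar tail, all computing the same tile math. As a minimal, hypothetical scalar sketch of that math for a single channel (bias accumulation and edge padding omitted):

void winograd_output_2x2_3x3_reference(const float F[4][4], float f[2][2])
{
    // f = A^T F A with A^T = [ 1  1  1  0 ]
    //                        [ 0  1 -1 -1 ]
    float FZ[4][2];
    for (int i = 0; i < 4; i++)
    {
        FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
        FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
    }
    for (int j = 0; j < 2; j++)
    {
        f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
        f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
    }
}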
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
deleted file mode 100644
index 60d7181..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_2x2_5x5_fp32_process_tile(
- const int n_channels,
- const float* const matrix_base,
- const int matrix_stride,
- const float* const biases,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
-)
-{
- constexpr int OutputTileRows = 2, OutputTileCols = 2;
- const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
- const int pad_right = Specialized ? PadRight : _pad_right;
-
- const int cells_i = 2 - pad_bottom;
- const int cells_j = 2 - pad_right;
-
- // Construct a map to the output cells
- float *outptrs[OutputTileRows][OutputTileCols];
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
- const float *inptr = matrix_base;
- const float *bptr = biases;
-
- if (bptr)
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j] + b;
- }
- }
- }
- }
- else
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][2], f[2][2];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][2], f[2][2];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][2], f[2][2];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j];
- }
- }
- }
- }
-}
-
-} // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0>
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1>
-};
-
-template class OutputTransform<5, 5, 6, 6, float>;
-} // namespace winograd
-
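The deleted output_2x2_5x5_fp32.cpp reduced a 6x6 Winograd-domain tile to a 2x2 spatial tile; each 1-D pass is a dot product against the rows of A^T = [1 1 1 1 1 0; 0 1 -1 2 -2 1]. A minimal scalar sketch of the per-channel math (helper name hypothetical, bias and padding omitted):

void winograd_output_2x2_5x5_reference(const float F[6][6], float f[2][2])
{
    float FZ[6][2];
    for (int i = 0; i < 6; i++)
    {
        FZ[i][0] = F[i][0] + F[i][1] + F[i][2] + F[i][3] + F[i][4];
        FZ[i][1] = F[i][1] - F[i][2] + 2.0f*F[i][3] - 2.0f*F[i][4] + F[i][5];
    }
    for (int j = 0; j < 2; j++)
    {
        f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j] + FZ[3][j] + FZ[4][j];
        f[1][j] = FZ[1][j] - FZ[2][j] + 2.0f*FZ[3][j] - 2.0f*FZ[4][j] + FZ[5][j];
    }
}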
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
deleted file mode 100644
index 15cc04b..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace
-{
-
-template <bool Specialized, int PadBottom=0, int PadRight=0>
-void winograd_output_transform_4x4_3x3_fp32_process_tile(
- const int n_channels,
- const float* const matrix_base,
- const int matrix_stride,
- const float* const biases,
- float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
-)
-{
- const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
- const int pad_right = Specialized ? PadRight : _pad_right;
- constexpr int TileRows = 4, TileCols = 4;
-
- const int cells_i = TileRows - pad_bottom;
- const int cells_j = TileCols - pad_right;
-
- // Construct a map to the output cells
- float *outptrs[TileRows][TileCols];
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
- }
- }
- const float *inptr = matrix_base;
- const float *bptr = biases;
-
- if (bptr)
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j] + b;
- }
- }
- }
- }
- else
- {
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][4], f[4][4];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 4;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1q_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 4;
- }
- }
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][4], f[4][4];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
- }
- }
- inptr += 2;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
-
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- vst1_f32(outptrs[i][j], f[i][j]);
- outptrs[i][j] += 2;
- }
- }
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][4], f[4][4];
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- F[i][j] = *(inptr + m*matrix_stride);
- }
- }
- inptr++;
-
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
-
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- }
-
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
- {
- *(outptrs[i][j]++) = f[i][j];
- }
- }
- }
- }
-}
-
-} // namespace (anonymous)
-
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
-};
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
- winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
-};
-
-template class OutputTransform<3, 3, 6, 6, float>;
-} // namespace winograd
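The deleted output_4x4_3x3_fp32.cpp applied the same structure with a 4x6 output matrix, producing a 4x4 spatial tile from a 6x6 Winograd-domain tile. A compact, hypothetical restatement of the per-channel math with the coefficient matrix written out explicitly (bias and padding omitted):

void winograd_output_4x4_3x3_reference(const float F[6][6], float f[4][4])
{
    // Rows of A^T, matching the scalar tail of the deleted kernel
    static const float AT[4][6] = {
        { 1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f },
        { 0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f },
        { 0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f },
        { 0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f }
    };
    float FZ[6][4] = {};
    for (int i = 0; i < 6; i++)          // FZ = F A
    {
        for (int k = 0; k < 4; k++)
        {
            for (int j = 0; j < 6; j++)
            {
                FZ[i][k] += F[i][j] * AT[k][j];
            }
        }
    }
    for (int k = 0; k < 4; k++)          // f = A^T FZ
    {
        for (int j = 0; j < 4; j++)
        {
            f[k][j] = 0.0f;
            for (int i = 0; i < 6; i++)
            {
                f[k][j] += AT[k][i] * FZ[i][j];
            }
        }
    }
}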
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
deleted file mode 100644
index 85cf418..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
- template <>
- template <>
- void WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[kernel_cols];
- for (int j = 0; j < kernel_cols; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[kernel_cols], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < kernel_cols; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36.0f;
- V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
- V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
- V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
- V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
- V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
- V[7] = (w[6]*1) / 1.0f;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template <>
- template <>
- void WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Redirect to the 1xN implementation
- WinogradGEMM<1, 2, 1, 7>::template WeightsTransform<float>::execute(
- n_output_channels, n_input_channels, input, output, matrix_stride,
- matrix_row_stride
- );
- }
-
- template <>
- template <>
- int WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template struct WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>;
- template struct WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>;
-}
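weights_2_7_fp32.cpp, removed above, held the 1-D kernel transform for F(2, 7); the 7x1 specialisation simply forwarded to the 1x7 one because the row and column transforms are independent. Per input/output channel pair the transform is a fixed linear map V = G w from the 7 kernel taps to the 8 columns of the inner tile. A hypothetical scalar restatement of the deleted loop body:

void winograd_weights_1x7_reference(const float w[7], float V[8])
{
    V[0] = -w[0] / 36.0f;
    V[1] = (w[0] - w[1] + w[2] - w[3] + w[4] - w[5] + w[6]) / 48.0f;
    V[2] = (w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6]) / 48.0f;
    V[3] = (-w[0] + 2.0f*w[1] - 4.0f*w[2] + 8.0f*w[3] - 16.0f*w[4] + 32.0f*w[5] - 64.0f*w[6]) / 120.0f;
    V[4] = (-w[0] - 2.0f*w[1] - 4.0f*w[2] - 8.0f*w[3] - 16.0f*w[4] - 32.0f*w[5] - 64.0f*w[6]) / 120.0f;
    V[5] = (w[0] - 3.0f*w[1] + 9.0f*w[2] - 27.0f*w[3] + 81.0f*w[4] - 243.0f*w[5] + 729.0f*w[6]) / 720.0f;
    V[6] = (w[0] + 3.0f*w[1] + 9.0f*w[2] + 27.0f*w[3] + 81.0f*w[4] + 243.0f*w[5] + 729.0f*w[6]) / 720.0f;
    V[7] = w[6];
}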
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
deleted file mode 100644
index 6c71461..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
- template <>
- template <>
- void WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input,
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- constexpr int inner_tile_i = 4;
- constexpr int inner_tile_j = 4;
-
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 3 * weight_col_stride;
- const float *inptrs[3][3];
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
-
- // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
- Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
- Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
-
- // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
- V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
-
- // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
- Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
- Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
-
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
-
- // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
- V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
-
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = w[0][j];
- Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
- Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
- Ww[3][j] = w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < inner_tile_i; i++)
- {
- V[i][0] = Ww[i][0];
- V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
- V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
- V[i][3] = Ww[i][2];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < inner_tile_i; i++)
- {
- for (int j = 0; j < inner_tile_j; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- const int channel_prod = shape.n_input_channels * shape.n_output_channels;
- return 2 * 18 * channel_prod;
- }
-
- template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>;
-} // namespace winograd
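The F(2x2, 3x3) kernel transform removed above is the classic V = G w G^T with G = [1 0 0; 0.5 0.5 0.5; 0.5 -0.5 0.5; 0 0 1]; the NEON paths vectorise the same row operations across four or two output channels at a time. A minimal scalar sketch for a single channel (name hypothetical):

void winograd_weights_2x2_3x3_reference(const float w[3][3], float V[4][4])
{
    float Ww[4][3];
    for (int j = 0; j < 3; j++)   // Ww = G w
    {
        Ww[0][j] = w[0][j];
        Ww[1][j] = 0.5f*(w[0][j] + w[1][j] + w[2][j]);
        Ww[2][j] = 0.5f*(w[0][j] - w[1][j] + w[2][j]);
        Ww[3][j] = w[2][j];
    }
    for (int i = 0; i < 4; i++)   // V = Ww G^T
    {
        V[i][0] = Ww[i][0];
        V[i][1] = 0.5f*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
        V[i][2] = 0.5f*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
        V[i][3] = Ww[i][2];
    }
}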
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
deleted file mode 100644
index 2f4f6e1..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
- template <>
- template <>
- void WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input,
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 5 * weight_col_stride;
- const float *inptrs[5][5];
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- // Ww[0][j] = w[0][j]/4.0f;
- Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
-
- // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[1][j] = vmulq_n_f32(
- vaddq_f32(
- vaddq_f32(
- vaddq_f32(w[1][j], w[0][j]),
- vaddq_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- -1.0f/6.0f
- );
-
- // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
- Ww[2][j] = vmulq_n_f32(
- vsubq_f32(
- vaddq_f32(
- vsubq_f32(w[1][j], w[0][j]),
- vsubq_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- 1.0f/6.0f
- );
-
- // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[3][j] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
- vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
- vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[5][j] = w[4][j];
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- // V[i][0] = Ww[i][0]/4.0f;
- V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
-
- // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][1] = vmulq_n_f32(
- vaddq_f32(
- vaddq_f32(
- vaddq_f32(Ww[i][1], Ww[i][0]),
- vaddq_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- -1.0f/6.0f
- );
-
- // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
- V[i][2] = vmulq_n_f32(
- vsubq_f32(
- vaddq_f32(
- vsubq_f32(Ww[i][1], Ww[i][0]),
- vsubq_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- 1.0f/6.0f
- );
-
- // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][3] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
- vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = vmulq_n_f32(
- vmlaq_n_f32(
- vaddq_f32(
- vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
- vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][5] = Ww[i][4];
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- // Ww[0][j] = w[0][j]/4.0f;
- Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
-
- // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[1][j] = vmul_n_f32(
- vadd_f32(
- vadd_f32(
- vadd_f32(w[1][j], w[0][j]),
- vadd_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- -1.0f/6.0f
- );
-
- // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
- Ww[2][j] = vmul_n_f32(
- vsub_f32(
- vadd_f32(
- vsub_f32(w[1][j], w[0][j]),
- vsub_f32(w[3][j], w[2][j])
- ),
- w[4][j]
- ),
- 1.0f/6.0f
- );
-
- // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[3][j] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
- vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
- vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
- ),
- w[4][j], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // Ww[5][j] = w[4][j];
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- // V[i][0] = Ww[i][0]/4.0f;
- V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
-
- // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][1] = vmul_n_f32(
- vadd_f32(
- vadd_f32(
- vadd_f32(Ww[i][1], Ww[i][0]),
- vadd_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- -1.0f/6.0f
- );
-
- // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
- V[i][2] = vmul_n_f32(
- vsub_f32(
- vadd_f32(
- vsub_f32(Ww[i][1], Ww[i][0]),
- vsub_f32(Ww[i][3], Ww[i][2])
- ),
- Ww[i][4]
- ),
- 1.0f/6.0f
- );
-
- // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][3] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
- vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = vmul_n_f32(
- vmla_n_f32(
- vadd_f32(
- vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
- vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
- ),
- Ww[i][4], 2.0f
- ),
- 1.0f/3.0f
- );
-
- // V[i][5] = Ww[i][4];
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[5][5], Ww[6][5], V[6][6];
-
- // Read weights
- for (int i = 0; i < 5; i++)
- {
- for (int j = 0; j < 5; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 5; j++)
- {
- Ww[0][j] = w[0][j]/4.0f;
- Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
- Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
- Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
- Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
- Ww[5][j] = w[4][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- V[i][0] = Ww[i][0]/4.0f;
- V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
- V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
- V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
- V[i][5] = Ww[i][4];
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- return 0; // TODO
- }
-
- template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>;
-} // namespace winograd
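The F(2x2, 5x5) kernel transform deleted above uses a 6x5 matrix G; spelling its rows out makes the fractions in the NEON code easier to follow. A hypothetical per-channel restatement as an explicit matrix product:

void winograd_weights_2x2_5x5_reference(const float w[5][5], float V[6][6])
{
    // Rows of G, matching the scalar tail of the deleted kernel
    static const float G[6][5] = {
        {  1.0f/4.0f,   0.0f,        0.0f,       0.0f,       0.0f      },
        { -1.0f/6.0f,  -1.0f/6.0f,  -1.0f/6.0f, -1.0f/6.0f, -1.0f/6.0f },
        { -1.0f/6.0f,   1.0f/6.0f,  -1.0f/6.0f,  1.0f/6.0f, -1.0f/6.0f },
        {  1.0f/24.0f,  1.0f/12.0f,  1.0f/6.0f,  1.0f/3.0f,  2.0f/3.0f },
        {  1.0f/24.0f, -1.0f/12.0f,  1.0f/6.0f, -1.0f/3.0f,  2.0f/3.0f },
        {  0.0f,        0.0f,        0.0f,       0.0f,       1.0f      }
    };
    float Gw[6][5] = {};
    for (int i = 0; i < 6; i++)       // Gw = G w
    {
        for (int j = 0; j < 5; j++)
        {
            for (int k = 0; k < 5; k++)
            {
                Gw[i][j] += G[i][k] * w[k][j];
            }
        }
    }
    for (int i = 0; i < 6; i++)       // V = Gw G^T
    {
        for (int j = 0; j < 6; j++)
        {
            V[i][j] = 0.0f;
            for (int k = 0; k < 5; k++)
            {
                V[i][j] += Gw[i][k] * G[j][k];
            }
        }
    }
}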
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
deleted file mode 100644
index 2f14e20..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
- template <>
- template <>
- void WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[kernel_cols];
- for (int j = 0; j < kernel_cols; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[kernel_cols], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < kernel_cols; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36;
- V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
- V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
- V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
- V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
- V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
- V[7] = (w[4]*1) / 1;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template <>
- template <>
- void WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Redirect to the 1xN implementation
- WinogradGEMM<1, 4, 1, 5>::template WeightsTransform<float>::execute(
- n_output_channels, n_input_channels, input, output, matrix_stride,
- matrix_row_stride
- );
- }
-
- template <>
- template <>
- int WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template struct WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>;
- template struct WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>;
-}
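weights_4_5_fp32.cpp is the F(4, 5) twin of the 1x7 file above: the Nx1 case again redirects to the 1xN implementation, and the per-channel map takes 5 kernel taps to an 8-element inner tile. A hypothetical scalar restatement of the deleted loop body:

void winograd_weights_1x5_reference(const float w[5], float V[8])
{
    V[0] = -w[0] / 36.0f;
    V[1] = (w[0] - w[1] + w[2] - w[3] + w[4]) / 48.0f;
    V[2] = (w[0] + w[1] + w[2] + w[3] + w[4]) / 48.0f;
    V[3] = (-w[0] + 2.0f*w[1] - 4.0f*w[2] + 8.0f*w[3] - 16.0f*w[4]) / 120.0f;
    V[4] = (-w[0] - 2.0f*w[1] - 4.0f*w[2] - 8.0f*w[3] - 16.0f*w[4]) / 120.0f;
    V[5] = (w[0] - 3.0f*w[1] + 9.0f*w[2] - 27.0f*w[3] + 81.0f*w[4]) / 720.0f;
    V[6] = (w[0] + 3.0f*w[1] + 9.0f*w[2] + 27.0f*w[3] + 81.0f*w[4]) / 720.0f;
    V[7] = w[4];
}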
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
deleted file mode 100644
index a56a475..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-namespace winograd
-{
- /* Float implementation for kernel transform F(4x4, 3x3) */
- template <>
- template <>
- void WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const auto weight_row_stride = 3 * weight_col_stride;
- const float *inptrs[3][3];
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
- }
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used and computed in this kernel
- float32x4_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1q_f32(inptrs[i][j]);
- inptrs[i][j] += 4;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed in this kernel
- float32x2_t w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = vld1_f32(inptrs[i][j]);
- inptrs[i][j] += 2;
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- // Ww[0][j] = 6*w[0][j];
- Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
-
- // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
-
- // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
-
- // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
-
- // Ww[5][j] = 24*w[2][j];
- Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- const float recip576 = 1.0f / 576.0f;
-
- // V[i][0] = 6*Ww[i][0];
- V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
-
- // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
- V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
-
- // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
- V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
-
- // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
- V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
- V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
-
- // V[i][5] = 24*Ww[i][2];
- V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, V[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3][3], Ww[6][3], V[6][6];
-
- // Read weights
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 3; j++)
- {
- w[i][j] = *(inptrs[i][j]++);
- }
- }
-
- // Compute the matrix W w
- for (int j = 0; j < 3; j++)
- {
- Ww[0][j] = 6*w[0][j];
- Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
- Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
- Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
- Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
- Ww[5][j] = 24*w[2][j];
- }
-
- // Compute V = W w WT
- for (int i = 0; i < 6; i++)
- {
- V[i][0] = ( 6*Ww[i][0]) / 576.0;
- V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
- V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
- V[i][5] = (24*Ww[i][2]) / 576.0;
- }
-
- // Store the transformed weights
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = V[i][j];
- }
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- const int channel_prod = shape.n_input_channels * shape.n_output_channels;
- return 9 * 16 * channel_prod;
- }
-
- template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>;
-}
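
Note: written as a matrix product, the scalar tail of the deleted F(4x4, 3x3) weight transform above is equivalent to the following (U, g and G are named here only for illustration):

$$
U = \frac{1}{576}\, G\, g\, G^{\mathsf{T}}, \qquad
G = \begin{bmatrix}
 6 &  0 &  0 \\
-4 & -4 & -4 \\
-4 &  4 & -4 \\
 1 &  2 &  4 \\
 1 & -2 &  4 \\
 0 &  0 & 24
\end{bmatrix}
$$

where g is the 3x3 kernel for one input/output channel pair; the NEON paths compute the same product four (or two) channel pairs at a time.
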
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
deleted file mode 100644
index c560aa8..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
-
-
-namespace winograd
-{
- template <>
- template <>
- void WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Get pointers to each cell of the weight tensor
- const auto weight_col_stride = n_input_channels * n_output_channels;
- const float *inptrs[3];
- for (int j = 0; j < 3; j++)
- {
- inptrs[j] = input + j*weight_col_stride;
- }
-
- // For each input channel
- for (int ic = 0; ic < n_input_channels; ic++)
- {
- float *outptr = output + ic * matrix_row_stride;
-
- // For each output channel
- int channels_remaining = n_output_channels;
- for (; channels_remaining; channels_remaining--)
- {
- // Matrices used and computed in this kernel
- float w[3], V[inner_tile_cols];
-
- // Read weights
- for (int j = 0; j < 3; j++)
- {
- w[j] = *(inptrs[j]++);
- }
-
- // Compute V = w WT
- V[0] = (w[0]*-1) / 36.0f;
- V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
- V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
- V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
- V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
- V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
- V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
- V[7] = (w[2]*1) / 1;
-
- // Store the transformed weights
- for (int j = 0; j < inner_tile_cols; j++)
- {
- *(outptr + j*matrix_stride) = V[j];
- }
- outptr++;
- }
- }
- }
-
- template <>
- template <>
- int WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template <>
- template <>
- void WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::execute(
- const int n_output_channels,
- const int n_input_channels,
- const float* const input, // NOTE: Data in HWIO order
- float* const output,
- const int matrix_stride,
- const int matrix_row_stride
- )
- {
- // Redirect to the 1xN implementation
- WinogradGEMM<1, 6, 1, 3>::template WeightsTransform<float>::execute(
- n_output_channels, n_input_channels, input, output, matrix_stride,
- matrix_row_stride
- );
- }
-
- template <>
- template <>
- int WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
- {
- (void) shape;
- return 0; // TODO
- }
-
- template struct WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>;
- template struct WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>;
-}
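
Note: the deleted 1-D F(6, 3) weight transform above is, in matrix form, V = G w with w the three kernel taps of one channel pair and (G named here only for illustration):

$$
G = \begin{bmatrix}
-\tfrac{1}{36} & 0 & 0 \\
\tfrac{1}{48} & -\tfrac{1}{48} & \tfrac{1}{48} \\
\tfrac{1}{48} & \tfrac{1}{48} & \tfrac{1}{48} \\
-\tfrac{1}{120} & \tfrac{2}{120} & -\tfrac{4}{120} \\
-\tfrac{1}{120} & -\tfrac{2}{120} & -\tfrac{4}{120} \\
\tfrac{1}{720} & -\tfrac{3}{720} & \tfrac{9}{720} \\
\tfrac{1}{720} & \tfrac{3}{720} & \tfrac{9}{720} \\
0 & 0 & 1
\end{bmatrix}
$$

The <6, 1, 3, 1> variant simply redirects to this implementation, as the deleted code above shows.
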
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
similarity index 64%
rename from src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd.cpp
index a7de2fd..226f303 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,13 @@
* SOFTWARE.
*/
#include <cstring>
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
+#include "winograd.hpp"
using namespace winograd;
/** Get the output shape of a convolution. */
-template <int kr, int kc, int itr, int itc>
-template <typename TOut, typename TIn>
-Tensor4DShape WinogradGEMM<kr, kc, itr, itc>::Convolution<TOut, TIn>::get_output_shape(
+template <int kr, int kc, int itr, int itc, WinogradRoots R>
+template <typename TOut, typename TIn, typename TInGEMM, typename TOutGEMM>
+Tensor4DShape WinogradGEMM<kr, kc, itr, itc, R>::Convolution<TOut, TIn, TInGEMM, TOutGEMM>::get_output_shape(
const KernelShape &kernel_shape,
const Tensor4DShape &in_shape,
const PaddingType padding
@@ -47,9 +46,9 @@
/* Get the memory required to transform the kernel.
*/
template <int kernel_rows, int kernel_cols,
- int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_transform_working_size(const KernelShape &shape)
+ int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_transform_working_size(const KernelShape &shape)
{
if (shape.ordering == HWIO)
{
@@ -68,17 +67,17 @@
/** Get the memory required to store the kernel transformed into the
* Winograd domain.
*/
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_storage_size(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_storage_size(const KernelShape &shape)
{
return N_GEMMS * get_kernel_matrix_size(shape);
}
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_storage_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_storage_size(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding
@@ -88,9 +87,9 @@
}
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_storage_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_storage_size(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding
@@ -102,9 +101,9 @@
/** Get the memory required to apply a Winograd operator to some input.
*/
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_working_space_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_working_space_size(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding_type
@@ -139,20 +138,20 @@
/* Get the memory required by a single "input" matrix.
*/
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_size(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding_type
)
{
- return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn);
+ return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGIn);
}
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_stride(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_input_matrix_stride(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding_type
@@ -171,21 +170,21 @@
/* Get the memory required by a single "output" matrix.
*/
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_size(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_size(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding_type
)
{
- return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut);
+ return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TGOut);
}
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_stride(
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_output_matrix_stride(
const KernelShape &kernel_shape,
const Tensor4DShape &input_shape,
const PaddingType padding_type
@@ -204,16 +203,16 @@
/* Get the memory required by a single "kernel" matrix.
*/
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_size(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_size(const KernelShape &shape)
{
- return sizeof(TIn) * get_kernel_matrix_stride(shape);
+ return sizeof(TGIn) * get_kernel_matrix_stride(shape);
}
-template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
-template <typename TOut, typename TIn>
-int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_stride(const KernelShape &shape)
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols, WinogradRoots roots>
+template <typename TOut, typename TIn, typename TGIn, typename TGOut>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols, roots>::Convolution<TOut, TIn, TGIn, TGOut>::get_kernel_matrix_stride(const KernelShape &shape)
{
const int K = shape.n_input_channels;
const int N = roundup(shape.n_output_channels, N_BLOCK);
@@ -222,19 +221,16 @@
// Instantiate required implementations
-template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
-template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
+template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<1, 6, 1, 3>::Convolution<float, float>;
-template class WinogradGEMM<6, 1, 3, 1>::Convolution<float, float>;
+template class WinogradGEMM<1, 6, 1, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<6, 1, 3, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;
+template class WinogradGEMM<2, 2, 5, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<1, 4, 1, 5>::Convolution<float, float>;
-template class WinogradGEMM<4, 1, 5, 1>::Convolution<float, float>;
+template class WinogradGEMM<1, 4, 1, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<4, 1, 5, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
-template class WinogradGEMM<1, 2, 1, 7>::Convolution<float, float>;
-template class WinogradGEMM<2, 1, 7, 1>::Convolution<float, float>;
-
-
-
+template class WinogradGEMM<1, 2, 1, 7, WinogradRoots::Integers>::Convolution<float, float, float, float>;
+template class WinogradGEMM<2, 1, 7, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>;
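
Note: the net effect of the template changes in this rename is that the Winograd roots become a compile-time parameter and the GEMM operand types are carried separately from the tensor types, so matrix buffer sizes are computed against the GEMM types. A compressed sketch of the new shape of the interface (illustrative only, not the actual header; the member names here are invented for the example):

#include <cstddef>

enum class WinogradRoots { Integers };

template <int KernelRows, int KernelCols, int OutputTileRows, int OutputTileCols, WinogradRoots Roots>
struct WinogradGEMM
{
    template <typename TOut, typename TIn, typename TGEMMIn, typename TGEMMOut>
    struct Convolution
    {
        // Matrix sizes are measured in the GEMM operand types, not the tensor types.
        static size_t input_matrix_bytes(int stride)  { return stride * sizeof(TGEMMIn);  }
        static size_t output_matrix_bytes(int stride) { return stride * sizeof(TGEMMOut); }
    };
};

// Instantiation mirroring the ones added above:
template struct WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>;
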
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
new file mode 100644
index 0000000..fcbd21f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include "padding.hpp"
+
+#define MEMBERFN(RTYPE) template <\
+ int InnerTileRows, int InnerTileCols,\
+ typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE InputTransform<InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
+
+#define Nx1MEMBERFN(RTYPE) template <\
+ int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE InputTransform<InnerTileRows, 1, TIn, TOut, Roots>
+
+namespace winograd
+{
+
+MEMBERFN()::InputTransform(
+ const int kernel_rows,
+ const int kernel_cols,
+ const int n_batches,
+ const int n_rows,
+ const int n_cols,
+ const int n_channels,
+ const int padding_top,
+ const int padding_left,
+ const int padding_bottom,
+ const int padding_right
+) : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels),
+ _inptr(nullptr), _outptr(nullptr),
+ _overlap_rows(kernel_rows - 1), _overlap_cols(kernel_cols - 1),
+ _padding_top(padding_top), _padding_left(padding_left), _padding_bottom(padding_bottom), _padding_right(padding_right),
+ _tiles_M(iceildiv(padding_top + n_rows + padding_bottom - kernel_rows + 1, InnerTileRows - kernel_rows + 1)),
+ _tiles_N(iceildiv(padding_left + n_cols + padding_right - kernel_cols + 1, InnerTileCols - kernel_cols + 1)),
+ _matrix_stride(0), _matrix_row_stride(0), _matrix_batch_stride(0),
+ _in_col_stride(0), _in_row_stride(0), _in_batch_stride(0),
+ _working_space_col_stride(n_channels),
+ _working_space_row_stride(InnerTileCols * _working_space_col_stride),
+ _working_space(nullptr)
+{
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr)
+{
+ set_input_tensor(inptr, _n_channels);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
+{
+ set_input_tensor(inptr, _n_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
+{
+ set_input_tensor(inptr, _n_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+ _inptr = static_cast<const TIn *>(inptr);
+ _in_batch_stride = ldbatch;
+ _in_row_stride = ldrow;
+ _in_col_stride = ldcol;
+}
+
+MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
+{
+ _outptr = static_cast<TOut *>(mptr);
+ _matrix_stride = ldmatrix;
+ _matrix_row_stride = ldrow;
+ _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
+}
+
+Nx1MEMBERFN()::InputTransform(
+ const int kernel_rows,
+ const int kernel_cols,
+ const int n_batches,
+ const int n_rows,
+ const int n_cols,
+ const int n_channels,
+ const int padding_top,
+ const int padding_left,
+ const int padding_bottom,
+ const int padding_right
+) : InputTransform<1, InnerTileRows, TIn, TOut, Roots>::InputTransform(
+ /* Transpose rows and columns */
+ kernel_cols, kernel_rows, n_batches, n_cols, n_rows, n_channels,
+ padding_left, padding_top, padding_right, padding_bottom
+ )
+{
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr)
+{
+ set_input_tensor(inptr, this->_n_channels);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldcol)
+{
+ set_input_tensor(inptr, this->_n_cols * ldcol, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldrow, const int ldcol)
+{
+ set_input_tensor(inptr, this->_n_rows * ldrow, ldrow, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_input_tensor(const void* const inptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+ // Transpose row and column strides
+ Base::set_input_tensor(inptr, ldbatch, ldcol, ldrow);
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+ return sizeof(TIn) * InnerTileRows * _working_space_row_stride * nthreads;
+}
+
+MEMBERFN(void)::set_working_space(void * const buffer)
+{
+ _working_space = static_cast<TIn *>(buffer);
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+ return iceildiv(_n_channels, WINDOW_BLOCK);
+}
+
+MEMBERFN(void)::run(
+ const unsigned int start,
+ const unsigned int stop,
+ const unsigned int threadid
+)
+{
+ // Determine the channels on which to work
+ if (start >= get_window())
+ {
+ return; // No work to do beyond the end of the window
+ }
+ const unsigned int start_channel = start * WINDOW_BLOCK;
+ const unsigned int stop_channel = std::min<unsigned int>(_n_channels , stop * WINDOW_BLOCK);
+ const unsigned int n_channels = stop_channel - start_channel;
+
+ // Loop over batches
+ for (int batch = 0; batch < _n_batches; batch++)
+ {
+ const TIn* const inptr_batch = _inptr + start_channel + batch*_in_batch_stride;
+ TOut* const outptr_batch = _outptr + start_channel + batch*_matrix_batch_stride;
+
+ // Loop over rows of tiles
+ for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
+ {
+ // Compute the starting and ending row of pixels within the row of tiles,
+ // hence compute the padding to apply to the top and bottom of each tile.
+ const int row_top = tile_i * (InnerTileRows - _overlap_rows) - _padding_top;
+ const int row_bottom = row_top + InnerTileRows;
+ const int row_pad_top = std::max(0, _padding_top - tile_i * (InnerTileRows - _overlap_rows));
+ const int row_pad_bottom = std::max(0, row_bottom - _n_rows);
+
+ // Get a pointer to the start of the row.
+ const int row_offset = std::min(0, row_pad_top - _padding_top);
+ const TIn* const inptr_row = inptr_batch + _in_row_stride*(row_offset + tile_i*(InnerTileRows - _overlap_rows));
+ TOut* const outptr_row = outptr_batch + tile_i*_tiles_N*_matrix_row_stride;
+
+ // Loop over tiles within the row
+ for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
+ {
+ // Compute the starting and ending column of pixels within the tile,
+ // hence compute the padding to apply to the left and right of the
+ // tile.
+ const int tile_left = tile_j * (InnerTileCols - _overlap_cols) - _padding_left;
+ const int tile_right = tile_left + InnerTileCols;
+ const int tile_pad_left = std::max(0, _padding_left - tile_j * (InnerTileCols - _overlap_cols));
+ const int tile_pad_right = std::max(0, tile_right - _n_cols);
+
+ // Get a pointer to the start of the tile.
+ const int col_offset = std::min(0, tile_pad_left - _padding_left);
+ const TIn* const inptr_tile = inptr_row + _in_col_stride*(col_offset + tile_j*(InnerTileCols - _overlap_cols));
+ TOut* const outptr_tile = outptr_row + tile_j * _matrix_row_stride;
+
+ // Transform the tile, applying padding if necessary.
+ if (row_pad_top || tile_pad_left || row_pad_bottom || tile_pad_right)
+ {
+ transform_padded_tile(
+ threadid, n_channels, outptr_tile, inptr_tile,
+ row_pad_top, tile_pad_left, row_pad_bottom, tile_pad_right
+ );
+ }
+ else
+ {
+ transform_unpadded_tile(threadid, n_channels, outptr_tile, inptr_tile);
+ }
+ }
+ }
+ }
+}
+
+MEMBERFN(void)::transform_unpadded_tile(
+ const unsigned int /* threadid unused */,
+ const int n_channels,
+ TOut * const outptr,
+ const TIn * const inptr
+)
+{
+ transform_tile(
+ n_channels, inptr, _in_row_stride, _in_col_stride, outptr, _matrix_stride
+ );
+}
+
+MEMBERFN(void)::transform_padded_tile(
+ const unsigned int threadid,
+ const int n_channels,
+ TOut * const outptr,
+ const TIn * const inptr,
+ const int padding_top,
+ const int padding_left,
+ const int padding_bottom,
+ const int padding_right
+)
+{
+ padding::copy_and_pad_tile(
+ InnerTileRows, InnerTileCols, n_channels,
+ inptr, _in_row_stride, _in_col_stride,
+ static_cast<TIn *>(get_working_space(threadid)), _working_space_row_stride, _working_space_col_stride,
+ padding_top, padding_left, padding_bottom, padding_right
+ );
+
+ transform_tile(
+ n_channels, static_cast<const TIn *>(get_working_space(threadid)),
+ _working_space_row_stride, _working_space_col_stride,
+ outptr, _matrix_stride
+ );
+}
+
+MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
+{
+ return _working_space + InnerTileRows * _working_space_row_stride * threadid;
+}
+
+} // namespace winograd
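
Note: get_window() and run() above split the transform over the channel dimension in WINDOW_BLOCK-sized chunks, so a scheduler can hand each thread a contiguous [start, stop) range of chunks. A standalone sketch of that partitioning pattern (the value of WINDOW_BLOCK and the process()/main() driver are assumptions made for the example; they are not library code):

#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

constexpr unsigned int WINDOW_BLOCK = 16; // assumed chunk size for the sketch

unsigned int get_window(unsigned int n_channels)
{
    return (n_channels + WINDOW_BLOCK - 1) / WINDOW_BLOCK; // iceildiv
}

void process(unsigned int start, unsigned int stop, unsigned int n_channels, unsigned int threadid)
{
    if (start >= get_window(n_channels)) return; // nothing beyond the window
    const unsigned int start_channel = start * WINDOW_BLOCK;
    const unsigned int stop_channel  = std::min(n_channels, stop * WINDOW_BLOCK);
    std::printf("thread %u: channels [%u, %u)\n", threadid, start_channel, stop_channel);
}

int main()
{
    const unsigned int n_channels = 70, n_threads = 4;
    const unsigned int window     = get_window(n_channels);
    const unsigned int per_thread = (window + n_threads - 1) / n_threads;

    std::vector<std::thread> workers;
    for (unsigned int t = 0; t < n_threads; t++)
    {
        const unsigned int start = t * per_thread;
        const unsigned int stop  = std::min(window, start + per_thread);
        workers.emplace_back(process, start, stop, n_channels, t);
    }
    for (auto &w : workers) w.join();
    return 0;
}
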
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..5040ec1
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "input.hpp"
+
+namespace winograd
+{
+
+template <>
+void InputTransform<1, 8, float, float, WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* const input_base,
+ const int, // We don't need to stride over rows
+ const int input_col_stride,
+ float* outptr,
+ const int matrix_stride
+)
+{
+ constexpr int inner_tile_cols = 8;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_cols];
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[j] = input_base + xj*input_col_stride;
+ }
+
+ // Vectors used/computed in this kernel.
+ float x[inner_tile_cols];
+ float U[inner_tile_cols];
+
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = 0.0f;
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __arm_any__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ float32x4_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdupq_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vld1q_f32(x_ptrs[j]);
+ x_ptrs[j] += 4;
+ }
+
+ // Compute U = x . X
+ U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1q_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 4;
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ float32x2_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdup_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vld1_f32(x_ptrs[j]);
+ x_ptrs[j] += 2;
+ }
+
+ // Compute U = x . X
+ U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = *(x_ptrs[j]++);
+ }
+
+ // Compute U = x . X
+ U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
+ U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
+ U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
+ U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
+ U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
+ U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
+ U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
+ U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = U[j];
+ }
+ outptr++;
+ }
+}
+
+template class InputTransform<1, 8, float, float, WinogradRoots::Integers>;
+template class InputTransform<8, 1, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
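
Note: the scalar tail of the 1x8 input transform above corresponds to U = B^T x, where x is the eight input samples of one channel and (B^T named here only for illustration):

$$
B^{\mathsf{T}} =
\begin{bmatrix}
-36 &   0 & 49 &   0 & -14 &   0 & 1 & 0 \\
  0 & -36 & 36 &  13 & -13 &  -1 & 1 & 0 \\
  0 &  36 & 36 & -13 & -13 &   1 & 1 & 0 \\
  0 & -18 &  9 &  20 & -10 &  -2 & 1 & 0 \\
  0 &  18 &  9 & -20 & -10 &   2 & 1 & 0 \\
  0 & -12 &  4 &  15 &  -5 &  -3 & 1 & 0 \\
  0 &  12 &  4 & -15 &  -5 &   3 & 1 & 0 \\
  0 & -36 &  0 &  49 &   0 & -14 & 0 & 1
\end{bmatrix}
$$

The vectorised paths above compute exactly the same linear combinations four (or two) channels at a time.
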
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..9393785
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "input.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+template <>
+void InputTransform<4, 4, float, float, WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* outptr,
+ const int matrix_stride
+)
+{
+ constexpr int inner_tile_rows = 4, inner_tile_cols = 4;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_rows][inner_tile_cols];
+ float XTx[inner_tile_rows][inner_tile_cols];
+ float U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel.
+ float32x4_t x[inner_tile_rows][inner_tile_cols];
+ float32x4_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x4_t U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdupq_n_f32(0.0f);
+ XTx[i][j] = vdupq_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1q_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel.
+ float32x2_t x[inner_tile_rows][inner_tile_cols];
+ float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x2_t U[inner_tile_rows][inner_tile_cols];
+
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ XTx[0][j] = x[0][j] - x[2][j];
+ XTx[1][j] = x[1][j] + x[2][j];
+ XTx[2][j] = x[2][j] - x[1][j];
+ XTx[3][j] = x[1][j] - x[3][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][3] = XTx[i][1] - XTx[i][3];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
+
+template class InputTransform<4, 4, float, float, WinogradRoots::Integers>;
+
+} // namespace
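
Note: the 4x4 input transform above is the F(2x2, 3x3) form U = B^T x B, applied to one 4x4 tile per channel, with (B^T named here only for illustration):

$$
B^{\mathsf{T}} =
\begin{bmatrix}
1 &  0 & -1 &  0 \\
0 &  1 &  1 &  0 \\
0 & -1 &  1 &  0 \\
0 &  1 &  0 & -1
\end{bmatrix}
$$

The intermediate XTx arrays in the code hold B^T x, and the U arrays hold the full product.
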
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..908fc82
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
@@ -0,0 +1,1308 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "input.hpp"
+
+namespace winograd
+{
+
+#ifdef __aarch64__
+
+template <>
+void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+ int n_channels,
+ const float* input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* matrix_base,
+ const int matrix_stride
+)
+{
+ const float pcoeffs[4] = {1.0f, 2.0f, 4.0f, 5.0f};
+ __asm__ __volatile__(
+ "ldr q0, [%[pcoeffs]]\n"
+ "add x25, %[inptr0], %[input_row_stride]\n"
+ "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+ "add x16, x25, %[input_row_stride]\n"
+ "add x19, x18, %[input_col_stride1]\n"
+ "add x26, x16, %[input_row_stride]\n"
+ "add x20, x19, %[input_col_stride1]\n"
+ "add x17, x26, %[input_row_stride]\n"
+ "add x21, x20, %[input_col_stride1]\n"
+ "add x27, x17, %[input_row_stride]\n"
+ "add x28, %[outptr0], %[output_row_stride]\n"
+ "add x11, %[output_col_stride1], %[output_col_stride1]\n"
+ "add x22, x28, %[output_row_stride]\n"
+ "add x13, x11, %[output_col_stride1]\n"
+ "add x12, x22, %[output_row_stride]\n"
+ "add x23, x13, %[output_col_stride1]\n"
+ "add x14, x12, %[output_row_stride]\n"
+ "add x15, x23, %[output_col_stride1]\n"
+ "add x24, x14, %[output_row_stride]\n"
+ "cmp %w[n_channels], #4\n"
+ "blt 2f\n"
+ "1:\n"
+ "ldr q8, [%[inptr0], x20]\n"
+ "ldr q2, [%[inptr0], x18]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [%[inptr0]]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr q1, [%[inptr0], x21]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr q4, [%[inptr0], x19]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr q12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr q5, [x16, x20]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr q20, [x16, x18]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr q3, [x16]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr q6, [x16, x21]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr q16, [x16, x19]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr q22, [x16, %[input_col_stride1]]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr q17, [x17, x20]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr q15, [x17, x18]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr q19, [x17]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr q18, [x17, x21]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr q13, [x17, x19]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr q21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "mov v11.16b, v1.16b\n"
+ "add x16, x16, #16\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #16\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "mov v2.16b, v5.16b\n"
+ "mov v3.16b, v5.16b\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "mov v4.16b, v5.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "mov v19.16b, v17.16b\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str q23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str q14, [%[outptr0], x11]\n"
+ "str q24, [%[outptr0], x13]\n"
+ "str q10, [%[outptr0], x23]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str q7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr q11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr q23, [x25, x18]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr q7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr q13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr q31, [x25, x19]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr q21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr q30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr q29, [x26, x18]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr q22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr q24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr q27, [x26, x19]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr q28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #16\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #16\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str q26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str q25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str q27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str q31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str q30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str q29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #16\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str q26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str q25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str q30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str q29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str q28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str q26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str q25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str q31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #16\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str q27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str q26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str q25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str q28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #16\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str q12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str q19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str q20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str q16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str q1, [x14, x23]\n"
+ "str q17, [x14, x15]\n"
+ "add x14, x14, #16\n"
+ "ldr q2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr q17, [x27, x18]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr q18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr q3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr q5, [x27, x19]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr q18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #16\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "sub %w[n_channels], %w[n_channels], #4\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "cmp %w[n_channels], #4\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "mov v3.16b, v3.16b\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "mov v4.16b, v4.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v5.16b, v6.16b\n"
+ "mov v6.16b, v1.16b\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "mov v9.16b, v2.16b\n"
+ "mov v10.16b, v3.16b\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str q4, [x24]\n"
+ "str q16, [x24, %[output_col_stride1]]\n"
+ "str q5, [x24, x11]\n"
+ "str q6, [x24, x13]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str q9, [x24, x23]\n"
+ "str q10, [x24, x15]\n"
+ "add x24, x24, #16\n"
+ "bge 1b\n"
+ "2:\n"
+ "cmp %w[n_channels], #2\n"
+ "blt 3f\n"
+ "ldr d8, [%[inptr0], x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr d2, [%[inptr0], x18]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr d9, [%[inptr0]]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr d1, [%[inptr0], x21]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr d4, [%[inptr0], x19]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr d12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr d5, [x16, x20]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr d20, [x16, x18]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr d3, [x16]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr d6, [x16, x21]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr d16, [x16, x19]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr d22, [x16, %[input_col_stride1]]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr d17, [x17, x20]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr d15, [x17, x18]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr d19, [x17]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr d18, [x17, x21]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr d13, [x17, x19]\n"
+ "mov v11.16b, v1.16b\n"
+ "ldr d21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #8\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "add x16, x16, #8\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #8\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "mov v2.16b, v5.16b\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "mov v3.16b, v5.16b\n"
+ "mov v4.16b, v5.16b\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "mov v19.16b, v17.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str d23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "str d15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str d14, [%[outptr0], x11]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str d24, [%[outptr0], x13]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str d10, [%[outptr0], x23]\n"
+ "str d7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #8\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr d11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr d23, [x25, x18]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr d13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr d31, [x25, x19]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr d21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr d30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr d29, [x26, x18]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr d22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr d24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr d27, [x26, x19]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr d28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #8\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #8\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str d26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str d25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str d27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str d31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str d30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str d29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #8\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str d26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str d25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str d30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str d29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str d28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str d26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str d25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str d31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #8\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str d27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str d26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str d25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str d28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #8\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str d12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str d19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str d20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str d16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str d1, [x14, x23]\n"
+ "str d17, [x14, x15]\n"
+ "add x14, x14, #8\n"
+ "ldr d2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d17, [x27, x18]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr d18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr d3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr d5, [x27, x19]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #8\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "sub %w[n_channels], %w[n_channels], #2\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "mov v3.16b, v3.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "mov v4.16b, v4.16b\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "mov v16.16b, v12.16b\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "mov v5.16b, v6.16b\n"
+ "mov v6.16b, v1.16b\n"
+ "mov v9.16b, v2.16b\n"
+ "mov v10.16b, v3.16b\n"
+ "str d4, [x24]\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str d16, [x24, %[output_col_stride1]]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str d5, [x24, x11]\n"
+ "str d6, [x24, x13]\n"
+ "str d9, [x24, x23]\n"
+ "str d10, [x24, x15]\n"
+ "add x24, x24, #8\n"
+ "3:\n"
+ "cbz %w[n_channels], 4f\n"
+ "ldr s8, [%[inptr0], x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr s2, [%[inptr0], x18]\n"
+ "mov v10.16b, v8.16b\n"
+ "ldr s9, [%[inptr0]]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "ldr s1, [%[inptr0], x21]\n"
+ "mov v9.16b, v8.16b\n"
+ "ldr s4, [%[inptr0], x19]\n"
+ "mov v7.16b, v8.16b\n"
+ "ldr s12, [%[inptr0], %[input_col_stride1]]\n"
+ "fmls v14.4s, v2.4s, v0.s[3]\n"
+ "ldr s5, [x16, x20]\n"
+ "fmls v10.4s, v12.4s, v0.s[2]\n"
+ "ldr s20, [x16, x18]\n"
+ "fmla v9.4s, v12.4s, v0.s[2]\n"
+ "ldr s3, [x16]\n"
+ "fmls v7.4s, v12.4s, v0.s[1]\n"
+ "ldr s6, [x16, x21]\n"
+ "fmls v10.4s, v2.4s, v0.s[2]\n"
+ "ldr s16, [x16, x19]\n"
+ "fmls v9.4s, v2.4s, v0.s[2]\n"
+ "ldr s22, [x16, %[input_col_stride1]]\n"
+ "fsub v7.4s, v7.4s, v2.4s\n"
+ "ldr s17, [x17, x20]\n"
+ "fadd v10.4s, v10.4s, v4.4s\n"
+ "ldr s15, [x17, x18]\n"
+ "fsub v9.4s, v9.4s, v4.4s\n"
+ "ldr s19, [x17]\n"
+ "fmla v7.4s, v4.4s, v0.s[1]\n"
+ "ldr s18, [x17, x21]\n"
+ "mov v8.16b, v8.16b\n"
+ "ldr s13, [x17, x19]\n"
+ "mov v11.16b, v1.16b\n"
+ "ldr s21, [x17, %[input_col_stride1]]\n"
+ "fmla v8.4s, v12.4s, v0.s[1]\n"
+ "add %[inptr0], %[inptr0], #4\n"
+ "fmla v11.4s, v12.4s, v0.s[2]\n"
+ "add x16, x16, #4\n"
+ "mov v1.16b, v5.16b\n"
+ "add x17, x17, #4\n"
+ "fsub v8.4s, v8.4s, v2.4s\n"
+ "mov v2.16b, v5.16b\n"
+ "fmls v8.4s, v4.4s, v0.s[1]\n"
+ "fmls v11.4s, v4.4s, v0.s[3]\n"
+ "fmla v1.4s, v3.4s, v0.s[2]\n"
+ "fmls v2.4s, v22.4s, v0.s[2]\n"
+ "mov v3.16b, v5.16b\n"
+ "mov v4.16b, v5.16b\n"
+ "mov v5.16b, v5.16b\n"
+ "mov v6.16b, v6.16b\n"
+ "fmls v1.4s, v20.4s, v0.s[3]\n"
+ "fmls v2.4s, v20.4s, v0.s[2]\n"
+ "fmla v3.4s, v22.4s, v0.s[2]\n"
+ "fmls v4.4s, v22.4s, v0.s[1]\n"
+ "fmla v5.4s, v22.4s, v0.s[1]\n"
+ "fmla v6.4s, v22.4s, v0.s[2]\n"
+ "fadd v2.4s, v2.4s, v16.4s\n"
+ "mov v12.16b, v17.16b\n"
+ "fmls v3.4s, v20.4s, v0.s[2]\n"
+ "fsub v4.4s, v4.4s, v20.4s\n"
+ "fmla v4.4s, v16.4s, v0.s[1]\n"
+ "fsub v5.4s, v5.4s, v20.4s\n"
+ "fmls v5.4s, v16.4s, v0.s[1]\n"
+ "fmls v6.4s, v16.4s, v0.s[3]\n"
+ "fsub v3.4s, v3.4s, v16.4s\n"
+ "fmla v12.4s, v19.4s, v0.s[2]\n"
+ "mov v19.16b, v17.16b\n"
+ "mov v20.16b, v17.16b\n"
+ "mov v16.16b, v17.16b\n"
+ "mov v17.16b, v17.16b\n"
+ "fmls v12.4s, v15.4s, v0.s[3]\n"
+ "fmls v19.4s, v21.4s, v0.s[2]\n"
+ "fmla v20.4s, v21.4s, v0.s[2]\n"
+ "fmls v16.4s, v21.4s, v0.s[1]\n"
+ "fmla v17.4s, v21.4s, v0.s[1]\n"
+ "mov v18.16b, v18.16b\n"
+ "fmls v19.4s, v15.4s, v0.s[2]\n"
+ "mov v23.16b, v12.16b\n"
+ "fmls v20.4s, v15.4s, v0.s[2]\n"
+ "fsub v16.4s, v16.4s, v15.4s\n"
+ "fmla v16.4s, v13.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v15.4s\n"
+ "fadd v19.4s, v19.4s, v13.4s\n"
+ "fmls v17.4s, v13.4s, v0.s[1]\n"
+ "fsub v20.4s, v20.4s, v13.4s\n"
+ "fmla v18.4s, v21.4s, v0.s[2]\n"
+ "fmla v23.4s, v14.4s, v0.s[2]\n"
+ "mov v15.16b, v19.16b\n"
+ "mov v14.16b, v20.16b\n"
+ "mov v24.16b, v16.16b\n"
+ "fmls v18.4s, v13.4s, v0.s[3]\n"
+ "fmla v15.4s, v10.4s, v0.s[2]\n"
+ "fmls v23.4s, v1.4s, v0.s[3]\n"
+ "fmla v14.4s, v9.4s, v0.s[2]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "mov v10.16b, v17.16b\n"
+ "fmls v15.4s, v2.4s, v0.s[3]\n"
+ "mov v7.16b, v18.16b\n"
+ "str s23, [%[outptr0]]\n"
+ "fmls v14.4s, v3.4s, v0.s[3]\n"
+ "fmls v24.4s, v4.4s, v0.s[3]\n"
+ "fmla v10.4s, v8.4s, v0.s[2]\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v7.4s, v11.4s, v0.s[2]\n"
+ "str s14, [%[outptr0], x11]\n"
+ "fmls v10.4s, v5.4s, v0.s[3]\n"
+ "str s24, [%[outptr0], x13]\n"
+ "fmls v7.4s, v6.4s, v0.s[3]\n"
+ "str s10, [%[outptr0], x23]\n"
+ "str s7, [%[outptr0], x15]\n"
+ "add %[outptr0], %[outptr0], #4\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v25.16b, v19.16b\n"
+ "ldr s11, [x25, x20]\n"
+ "mov v10.16b, v11.16b\n"
+ "ldr s23, [x25, x18]\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr s7, [x25]\n"
+ "fmla v10.4s, v7.4s, v0.s[2]\n"
+ "ldr s13, [x25, x21]\n"
+ "mov v7.16b, v11.16b\n"
+ "ldr s31, [x25, x19]\n"
+ "mov v8.16b, v11.16b\n"
+ "ldr s21, [x25, %[input_col_stride1]]\n"
+ "fmls v10.4s, v23.4s, v0.s[3]\n"
+ "ldr s30, [x26, x20]\n"
+ "fmls v9.4s, v21.4s, v0.s[2]\n"
+ "ldr s29, [x26, x18]\n"
+ "fmla v7.4s, v21.4s, v0.s[2]\n"
+ "ldr s22, [x26]\n"
+ "fmls v8.4s, v21.4s, v0.s[1]\n"
+ "ldr s24, [x26, x21]\n"
+ "fmls v9.4s, v23.4s, v0.s[2]\n"
+ "ldr s27, [x26, x19]\n"
+ "fmls v7.4s, v23.4s, v0.s[2]\n"
+ "ldr s28, [x26, %[input_col_stride1]]\n"
+ "fsub v8.4s, v8.4s, v23.4s\n"
+ "add x25, x25, #4\n"
+ "fadd v9.4s, v9.4s, v31.4s\n"
+ "add x26, x26, #4\n"
+ "fsub v7.4s, v7.4s, v31.4s\n"
+ "fmla v8.4s, v31.4s, v0.s[1]\n"
+ "mov v11.16b, v11.16b\n"
+ "mov v15.16b, v13.16b\n"
+ "mov v14.16b, v30.16b\n"
+ "mov v13.16b, v30.16b\n"
+ "fmla v11.4s, v21.4s, v0.s[1]\n"
+ "fmla v15.4s, v21.4s, v0.s[2]\n"
+ "fmla v14.4s, v22.4s, v0.s[2]\n"
+ "fmls v13.4s, v28.4s, v0.s[2]\n"
+ "mov v21.16b, v30.16b\n"
+ "mov v22.16b, v30.16b\n"
+ "fsub v11.4s, v11.4s, v23.4s\n"
+ "fmls v15.4s, v31.4s, v0.s[3]\n"
+ "fmls v11.4s, v31.4s, v0.s[1]\n"
+ "fmls v14.4s, v29.4s, v0.s[3]\n"
+ "fmls v13.4s, v29.4s, v0.s[2]\n"
+ "fmla v21.4s, v28.4s, v0.s[2]\n"
+ "fmls v22.4s, v28.4s, v0.s[1]\n"
+ "mov v23.16b, v30.16b\n"
+ "mov v24.16b, v24.16b\n"
+ "fmls v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v13.4s, v13.4s, v27.4s\n"
+ "fmls v21.4s, v29.4s, v0.s[2]\n"
+ "fsub v22.4s, v22.4s, v29.4s\n"
+ "fmla v23.4s, v28.4s, v0.s[1]\n"
+ "fmla v22.4s, v27.4s, v0.s[1]\n"
+ "fmla v24.4s, v28.4s, v0.s[2]\n"
+ "fsub v21.4s, v21.4s, v27.4s\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fsub v23.4s, v23.4s, v29.4s\n"
+ "fmls v25.4s, v9.4s, v0.s[2]\n"
+ "fmls v23.4s, v27.4s, v0.s[1]\n"
+ "fmls v24.4s, v27.4s, v0.s[3]\n"
+ "fadd v26.4s, v26.4s, v14.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str s26, [x28]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "fmls v27.4s, v7.4s, v0.s[2]\n"
+ "mov v31.16b, v16.16b\n"
+ "mov v30.16b, v17.16b\n"
+ "mov v29.16b, v18.16b\n"
+ "fadd v25.4s, v25.4s, v13.4s\n"
+ "fmls v31.4s, v8.4s, v0.s[2]\n"
+ "str s25, [x28, %[output_col_stride1]]\n"
+ "fmls v27.4s, v3.4s, v0.s[2]\n"
+ "fmls v30.4s, v11.4s, v0.s[2]\n"
+ "fmls v29.4s, v15.4s, v0.s[2]\n"
+ "fmls v31.4s, v4.4s, v0.s[2]\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v27.4s, v27.4s, v21.4s\n"
+ "mov v25.16b, v19.16b\n"
+ "str s27, [x28, x11]\n"
+ "fmls v30.4s, v5.4s, v0.s[2]\n"
+ "fadd v31.4s, v31.4s, v22.4s\n"
+ "fmls v29.4s, v6.4s, v0.s[2]\n"
+ "str s31, [x28, x13]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "fadd v30.4s, v30.4s, v23.4s\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "str s30, [x28, x23]\n"
+ "fadd v29.4s, v29.4s, v24.4s\n"
+ "str s29, [x28, x15]\n"
+ "fmls v26.4s, v1.4s, v0.s[2]\n"
+ "fmls v25.4s, v2.4s, v0.s[2]\n"
+ "add x28, x28, #4\n"
+ "mov v30.16b, v20.16b\n"
+ "mov v29.16b, v16.16b\n"
+ "fsub v26.4s, v26.4s, v14.4s\n"
+ "mov v28.16b, v17.16b\n"
+ "str s26, [x22]\n"
+ "fsub v25.4s, v25.4s, v13.4s\n"
+ "str s25, [x22, %[output_col_stride1]]\n"
+ "fmla v30.4s, v7.4s, v0.s[2]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "fmla v28.4s, v11.4s, v0.s[2]\n"
+ "mov v26.16b, v18.16b\n"
+ "mov v25.16b, v12.16b\n"
+ "fmls v30.4s, v3.4s, v0.s[2]\n"
+ "mov v31.16b, v19.16b\n"
+ "fmls v29.4s, v4.4s, v0.s[2]\n"
+ "fmls v28.4s, v5.4s, v0.s[2]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "fmls v25.4s, v10.4s, v0.s[1]\n"
+ "fsub v30.4s, v30.4s, v21.4s\n"
+ "fmls v31.4s, v9.4s, v0.s[1]\n"
+ "str s30, [x22, x11]\n"
+ "fsub v29.4s, v29.4s, v22.4s\n"
+ "str s29, [x22, x13]\n"
+ "fsub v28.4s, v28.4s, v23.4s\n"
+ "str s28, [x22, x23]\n"
+ "fmls v26.4s, v6.4s, v0.s[2]\n"
+ "fsub v25.4s, v25.4s, v1.4s\n"
+ "fsub v31.4s, v31.4s, v2.4s\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "fmla v31.4s, v13.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v24.4s\n"
+ "mov v27.16b, v20.16b\n"
+ "str s26, [x22, x15]\n"
+ "mov v26.16b, v16.16b\n"
+ "str s25, [x12]\n"
+ "fmls v27.4s, v7.4s, v0.s[1]\n"
+ "str s31, [x12, %[output_col_stride1]]\n"
+ "fmls v26.4s, v8.4s, v0.s[1]\n"
+ "mov v25.16b, v17.16b\n"
+ "add x22, x22, #4\n"
+ "fsub v27.4s, v27.4s, v3.4s\n"
+ "mov v28.16b, v18.16b\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "fsub v26.4s, v26.4s, v4.4s\n"
+ "fmla v26.4s, v22.4s, v0.s[1]\n"
+ "fmls v25.4s, v11.4s, v0.s[1]\n"
+ "fmls v28.4s, v15.4s, v0.s[1]\n"
+ "mov v12.16b, v12.16b\n"
+ "str s27, [x12, x11]\n"
+ "mov v19.16b, v19.16b\n"
+ "str s26, [x12, x13]\n"
+ "fsub v25.4s, v25.4s, v5.4s\n"
+ "fmla v25.4s, v23.4s, v0.s[1]\n"
+ "fsub v28.4s, v28.4s, v6.4s\n"
+ "fmla v28.4s, v24.4s, v0.s[1]\n"
+ "fmla v12.4s, v10.4s, v0.s[1]\n"
+ "fmla v19.4s, v9.4s, v0.s[1]\n"
+ "mov v20.16b, v20.16b\n"
+ "str s25, [x12, x23]\n"
+ "mov v16.16b, v16.16b\n"
+ "str s28, [x12, x15]\n"
+ "fsub v12.4s, v12.4s, v1.4s\n"
+ "fmls v12.4s, v14.4s, v0.s[1]\n"
+ "add x12, x12, #4\n"
+ "fsub v19.4s, v19.4s, v2.4s\n"
+ "fmla v20.4s, v7.4s, v0.s[1]\n"
+ "fmls v19.4s, v13.4s, v0.s[1]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "str s12, [x14]\n"
+ "mov v1.16b, v17.16b\n"
+ "fsub v20.4s, v20.4s, v3.4s\n"
+ "mov v17.16b, v18.16b\n"
+ "str s19, [x14, %[output_col_stride1]]\n"
+ "fmls v20.4s, v21.4s, v0.s[1]\n"
+ "fsub v16.4s, v16.4s, v4.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fmls v16.4s, v22.4s, v0.s[1]\n"
+ "fmla v17.4s, v15.4s, v0.s[1]\n"
+ "str s20, [x14, x11]\n"
+ "fsub v1.4s, v1.4s, v5.4s\n"
+ "str s16, [x14, x13]\n"
+ "fmls v1.4s, v23.4s, v0.s[1]\n"
+ "fsub v17.4s, v17.4s, v6.4s\n"
+ "fmls v17.4s, v24.4s, v0.s[1]\n"
+ "str s1, [x14, x23]\n"
+ "str s17, [x14, x15]\n"
+ "add x14, x14, #4\n"
+ "ldr s2, [x27, x20]\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr s17, [x27, x18]\n"
+ "mov v12.16b, v2.16b\n"
+ "ldr s18, [x27]\n"
+ "fmla v4.4s, v18.4s, v0.s[2]\n"
+ "ldr s3, [x27, x21]\n"
+ "mov v6.16b, v2.16b\n"
+ "ldr s5, [x27, x19]\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr s18, [x27, %[input_col_stride1]]\n"
+ "fmls v4.4s, v17.4s, v0.s[3]\n"
+ "add x27, x27, #4\n"
+ "fmls v12.4s, v18.4s, v0.s[2]\n"
+ "fmla v6.4s, v18.4s, v0.s[2]\n"
+ "fmls v1.4s, v18.4s, v0.s[1]\n"
+ "mov v2.16b, v2.16b\n"
+ "mov v3.16b, v3.16b\n"
+ "mov v4.16b, v4.16b\n"
+ "fmls v12.4s, v17.4s, v0.s[2]\n"
+ "fmls v6.4s, v17.4s, v0.s[2]\n"
+ "fsub v1.4s, v1.4s, v17.4s\n"
+ "fmla v2.4s, v18.4s, v0.s[1]\n"
+ "fmla v1.4s, v5.4s, v0.s[1]\n"
+ "fmla v3.4s, v18.4s, v0.s[2]\n"
+ "fadd v12.4s, v12.4s, v5.4s\n"
+ "fsub v6.4s, v6.4s, v5.4s\n"
+ "fsub v2.4s, v2.4s, v17.4s\n"
+ "fmla v4.4s, v10.4s, v0.s[2]\n"
+ "fmls v2.4s, v5.4s, v0.s[1]\n"
+ "fmls v3.4s, v5.4s, v0.s[3]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v5.16b, v6.16b\n"
+ "fmls v4.4s, v14.4s, v0.s[3]\n"
+ "mov v6.16b, v1.16b\n"
+ "fmla v16.4s, v9.4s, v0.s[2]\n"
+ "fmla v5.4s, v7.4s, v0.s[2]\n"
+ "fmla v6.4s, v8.4s, v0.s[2]\n"
+ "mov v9.16b, v2.16b\n"
+ "str s4, [x24]\n"
+ "mov v10.16b, v3.16b\n"
+ "fmls v16.4s, v13.4s, v0.s[3]\n"
+ "fmls v5.4s, v21.4s, v0.s[3]\n"
+ "fmls v6.4s, v22.4s, v0.s[3]\n"
+ "fmla v9.4s, v11.4s, v0.s[2]\n"
+ "fmla v10.4s, v15.4s, v0.s[2]\n"
+ "str s16, [x24, %[output_col_stride1]]\n"
+ "str s5, [x24, x11]\n"
+ "fmls v9.4s, v23.4s, v0.s[3]\n"
+ "str s6, [x24, x13]\n"
+ "fmls v10.4s, v24.4s, v0.s[3]\n"
+ "str s9, [x24, x23]\n"
+ "str s10, [x24, x15]\n"
+ "add x24, x24, #4\n"
+ "4:\n"
+ : [outptr0] "+r" (matrix_base),
+ [n_channels] "+r" (n_channels),
+ [inptr0] "+r" (input_base)
+ : [pcoeffs] "r" (pcoeffs),
+ [output_row_stride] "r" (6 * matrix_stride * sizeof(float)),
+ [output_col_stride1] "r" (matrix_stride * sizeof(float)),
+ [input_row_stride] "r" (input_row_stride * sizeof(float)),
+ [input_col_stride1] "r" (input_col_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+ "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
+ "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+ "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+}
+
+#else // __arm__ not __aarch64__
+
+template <>
+void InputTransform<6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* outptr,
+ const int matrix_stride
+)
+{
+ constexpr int inner_tile_rows = 6;
+ constexpr int inner_tile_cols = 6;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = 0, xi = 0; i < inner_tile_rows; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = 0, xj = 0; j < inner_tile_cols; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_rows][inner_tile_cols];
+ float XTx[inner_tile_rows][inner_tile_cols];
+ float U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel
+ float32x2_t x[inner_tile_rows][inner_tile_cols];
+ float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x2_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
+
+#endif
+
+template class InputTransform<6, 6, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
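For reference, the scalar tail above (and the vectorised paths before it) evaluate, per channel, U = B^T . X . B, where X is the 6x6 input tile and B^T collects the coefficients written out in the comments. A minimal standalone sketch of that matrix form follows; the namespace and helper name are illustrative and not part of this patch.

// Reference-only sketch: per-channel 6x6 Winograd input transform, U = B^T * X * B.
namespace winograd_reference {

inline void input_transform_6x6_single_channel(const float X[6][6], float U[6][6])
{
    // Rows of B^T match the XTx[0..5] formulas in the scalar loop above.
    static const float BT[6][6] = {
        { 4.f,  0.f, -5.f,  0.f, 1.f, 0.f },
        { 0.f, -4.f, -4.f,  1.f, 1.f, 0.f },
        { 0.f,  4.f, -4.f, -1.f, 1.f, 0.f },
        { 0.f, -2.f, -1.f,  2.f, 1.f, 0.f },
        { 0.f,  2.f, -1.f, -2.f, 1.f, 0.f },
        { 0.f,  4.f,  0.f, -5.f, 0.f, 1.f },
    };

    float BTX[6][6];  // B^T * X
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < 6; k++)
            {
                acc += BT[i][k] * X[k][j];
            }
            BTX[i][j] = acc;
        }
    }

    // U = (B^T * X) * B, where B(k, j) == BT[j][k]
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < 6; k++)
            {
                acc += BTX[i][k] * BT[j][k];
            }
            U[i][j] = acc;
        }
    }
}

} // namespace winograd_reference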
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
new file mode 100644
index 0000000..e45f186
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "winograd.hpp"
+using namespace winograd;
+
+#define MEMBERFN(RTYPE) template <\
+ int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE WeightTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
+MEMBERFN()::WeightTransform(
+ const int n_output_channels,
+ const int n_input_channels
+) : _n_output_channels(n_output_channels), _n_input_channels(n_input_channels),
+ _matrices(nullptr), _matrix_stride(0), _matrix_row_stride(0), _weights(nullptr)
+{
+
+}
+
+MEMBERFN(void)::set_weight_tensor(const void * const weights)
+{
+ _weights = static_cast<const TIn *>(weights);
+}
+
+MEMBERFN(void)::set_output_matrices(void * const mptr, const int ldmatrix, const int ldrow)
+{
+ _matrices = static_cast<TOut *>(mptr);
+ _matrix_stride = ldmatrix;
+ _matrix_row_stride = ldrow;
+}
+
+MEMBERFN(size_t)::get_working_space_size(unsigned int) const
+{
+ return 0;
+}
+
+MEMBERFN(void)::set_working_space(void *)
+{
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+ // TODO When the weights transform supports multithreading, return the number
+ // of output channels. For now we return 1 to indicate that the weights must
+ // be transformed as a single block.
+ // return n_output_channels;
+ return 1;
+}
+
+MEMBERFN(void)::run(const unsigned int, const unsigned int, unsigned int)
+{
+ execute(
+ _n_output_channels, _n_input_channels, _weights,
+ _matrices, _matrix_stride, _matrix_row_stride
+ );
+}
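A hedged usage sketch of the interface defined above: construct, set the weight tensor and the output matrices, then run() over the (currently single-entry) window. The wrapper function and the F(4x4, 3x3) instantiation are assumptions for illustration; only the member functions shown above come from this patch.

#include "winograd.hpp"

// Illustrative only: transform a 3x3 weight tensor into its 6x6 Winograd-domain matrices.
void transform_weights_example(const float *weights,     // weight tensor, in the layout expected by execute()
                               float       *matrices,    // destination for the transformed matrices
                               const int    matrix_stride,
                               const int    matrix_row_stride,
                               const int    n_output_channels,
                               const int    n_input_channels)
{
    using WeightTransform3x3 =
        winograd::WeightTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>;

    WeightTransform3x3 transform(n_output_channels, n_input_channels);
    transform.set_weight_tensor(weights);
    transform.set_output_matrices(matrices, matrix_stride, matrix_row_stride);
    transform.set_working_space(nullptr);  // get_working_space_size() is 0 above

    // get_window() returns 1 until the transform is multithreaded, so one call covers all the work.
    transform.run(0, transform.get_window(), 0);
}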
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
new file mode 100644
index 0000000..d97af21
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include "winograd.hpp"
+#include "padding.hpp"
+#include "utils.hpp"
+
+#define MEMBERFN(RTYPE) template<\
+ int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols,\
+ typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE OutputTransform<KernelRows, KernelCols, InnerTileRows, InnerTileCols, TIn, TOut, Roots>
+
+#define Nx1MEMBERFN(RTYPE) template<\
+ int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots\
+> RTYPE OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots>
+
+namespace winograd
+{
+
+MEMBERFN()::OutputTransform(
+ const int n_batches,
+ const int n_rows,
+ const int n_cols,
+ const int n_channels
+) : _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels),
+ _matrix_base(nullptr),
+ _biases(nullptr),
+ _matrix_stride(0), _matrix_row_stride(0), _matrix_batch_stride(0),
+ _outptr(nullptr),
+ _tiles_M(iceildiv(n_rows, output_tile_rows)),
+ _tiles_N(iceildiv(n_cols, output_tile_cols)),
+ _out_col_stride(0), _out_row_stride(0), _out_batch_stride(0),
+ _working_space_col_stride(n_channels),
+ _working_space_row_stride(output_tile_cols * _working_space_col_stride),
+ _working_space(nullptr)
+{
+}
+
+MEMBERFN(void)::set_input_matrices(const void * const mptr, const int ldmatrix, const int ldrow)
+{
+ _matrix_base = static_cast<const TIn *>(mptr);
+ _matrix_stride = ldmatrix;
+ _matrix_row_stride = ldrow;
+ _matrix_batch_stride = _tiles_M * _tiles_N * ldrow;
+}
+
+MEMBERFN(void)::set_bias(const void * const bias)
+{
+ _biases = static_cast<const TOut *>(bias);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr)
+{
+ set_output_tensor(outptr, _n_channels);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
+{
+ set_output_tensor(outptr, _n_cols * ldcol, ldcol);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
+{
+ set_output_tensor(outptr, _n_rows * ldrow, ldrow, ldcol);
+}
+
+MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+ _outptr = static_cast<TOut *>(outptr);
+ _out_batch_stride = ldbatch;
+ _out_row_stride = ldrow;
+ _out_col_stride = ldcol;
+}
+
+Nx1MEMBERFN()::OutputTransform(
+ const int n_batches,
+ const int n_rows,
+ const int n_cols,
+ const int n_channels
+) : OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::OutputTransform(
+ n_batches, n_cols, n_rows, n_channels /* Transpose rows and columns */
+ )
+{
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr)
+{
+ set_output_tensor(outptr, this->_n_channels);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldcol)
+{
+ set_output_tensor(outptr, this->_n_cols * ldcol, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldrow, const int ldcol)
+{
+ set_output_tensor(outptr, this->_n_rows * ldrow, ldrow, ldcol);
+}
+
+Nx1MEMBERFN(void)::set_output_tensor(void * const outptr, const int ldbatch, const int ldrow, const int ldcol)
+{
+ // Transpose rows and columns
+ Base::set_output_tensor(outptr, ldbatch, ldcol, ldrow);
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+ return sizeof(TOut) * output_tile_rows * _working_space_row_stride * nthreads;
+}
+
+MEMBERFN(void)::set_working_space(void * const buffer)
+{
+ _working_space = static_cast<TOut *>(buffer);
+}
+
+MEMBERFN(unsigned int)::get_window(void) const
+{
+ return iceildiv(_n_channels, WINDOW_BLOCK);
+}
+
+MEMBERFN(void)::run(
+ const unsigned int start,
+ const unsigned int stop,
+ const unsigned int threadid
+)
+{
+ // Determine the channels on which to work
+ if (start >= get_window())
+ {
+ return; // No work to do beyond the end of the window
+ }
+ const unsigned int start_channel = start * WINDOW_BLOCK;
+ const unsigned int stop_channel = std::min<unsigned int>(_n_channels, stop * WINDOW_BLOCK);
+ const unsigned int n_channels = stop_channel - start_channel;
+
+ const auto matrix_tile_col_stride = _matrix_row_stride;
+ const auto matrix_tile_row_stride = _tiles_N * matrix_tile_col_stride;
+
+ const TOut* const bptr = (_biases == nullptr) ? nullptr : _biases + start_channel;
+
+ // Loop over batches
+ for (int batch = 0; batch < _n_batches; batch++)
+ {
+ const TIn* const matrix_batch = _matrix_base + start_channel + batch * _matrix_batch_stride;
+ TOut* const outptr_batch = _outptr + start_channel + batch * _out_batch_stride;
+
+ for (int tile_i = 0; tile_i < _tiles_M; tile_i++)
+ {
+ // Compute properties of the row of output tiles
+ const int row_pad_bottom = std::max(0, (tile_i + 1)*output_tile_rows - _n_rows);
+ const TIn* const matrix_tile_row = matrix_batch + tile_i * matrix_tile_row_stride;
+ TOut* const outptr_row = outptr_batch + tile_i * output_tile_rows * _out_row_stride;
+
+ for (int tile_j = 0; tile_j < _tiles_N; tile_j++)
+ {
+        // Compute properties of this specific tile
+ const int tile_pad_right = std::max(0, (tile_j + 1)*output_tile_cols - _n_cols);
+ const TIn* const matrix_tile = matrix_tile_row + tile_j * matrix_tile_col_stride;
+ TOut* const outptr_tile = outptr_row + tile_j * output_tile_cols * _out_col_stride;
+
+ // Perform the transformation
+ if (row_pad_bottom || tile_pad_right)
+ {
+ transform_cropped_tile(
+ threadid, n_channels, outptr_tile, matrix_tile, bptr,
+ row_pad_bottom, tile_pad_right
+ );
+ }
+ else
+ {
+ transform_uncropped_tile(
+ threadid, n_channels, outptr_tile, matrix_tile, bptr
+ );
+ }
+ }
+ }
+ }
+}
+
+MEMBERFN(void)::transform_uncropped_tile(
+ const unsigned int /* threadid unused */,
+ const int n_channels,
+ TOut * const outptr,
+ const TIn * const inptr,
+ const TOut * const biases
+)
+{
+ transform_tile(
+ n_channels, inptr, _matrix_stride, biases,
+ outptr, _out_row_stride, _out_col_stride
+ );
+}
+
+MEMBERFN(void)::transform_cropped_tile(
+ const unsigned int threadid,
+ const int n_channels,
+ TOut * const outptr,
+ const TIn * const inptr,
+ const TOut * const biases,
+ const int pad_bottom,
+ const int pad_right
+)
+{
+ // Transform into working space and then copy the relevant section out.
+ TOut *wsptr = static_cast<TOut *>(get_working_space(threadid));
+ transform_tile(
+ n_channels, inptr, _matrix_stride, biases,
+ wsptr, _working_space_row_stride, _working_space_col_stride
+ );
+
+ padding::crop_and_copy_tile(
+ output_tile_rows, output_tile_cols, n_channels,
+ wsptr, _working_space_row_stride, _working_space_col_stride,
+ outptr, _out_row_stride, _out_col_stride,
+ 0u, 0u, pad_bottom, pad_right
+ );
+}
+
+MEMBERFN(void *)::get_working_space(const unsigned int threadid) const
+{
+ return _working_space + output_tile_rows * _working_space_row_stride * threadid;
+}
+
+} // namespace winograd
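In the same spirit, a hedged sketch of how the OutputTransform interface above is driven by a single worker. The instantiation matches one provided later in this patch; the wrapper function and the dense-NHWC assumption on the output tensor are illustrative only.

#include <vector>
#include "winograd.hpp"

// Illustrative only: assemble the output tensor from Winograd-domain matrices.
void transform_output_example(const float *matrices, const int matrix_stride, const int matrix_row_stride,
                              const float *bias,      // may be nullptr
                              float       *output,    // dense NHWC, hence the one-argument set_output_tensor()
                              const int n_batches, const int n_rows, const int n_cols, const int n_channels)
{
    using OutputTransform3x3 =
        winograd::OutputTransform<3, 3, 4, 4, float, float, winograd::WinogradRoots::Integers>;

    OutputTransform3x3 transform(n_batches, n_rows, n_cols, n_channels);
    transform.set_input_matrices(matrices, matrix_stride, matrix_row_stride);
    transform.set_bias(bias);
    transform.set_output_tensor(output);

    // One worker thread: the working space holds a single output tile per thread.
    std::vector<char> working_space(transform.get_working_space_size(1));
    transform.set_working_space(working_space.data());

    // The window is blocked over channels; one call over [0, get_window()) covers them all.
    transform.run(0, transform.get_window(), 0);
}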
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
similarity index 71%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
index ea842a4..c32d7f2 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,43 +22,29 @@
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm.hpp"
+#include "output.hpp"
-namespace
+namespace winograd
{
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_2_7_fp32_process_tile(
+template <>
+void OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
const int n_channels,
- const float* const matrix_base,
+ const float* inptr,
const int matrix_stride,
- const float* const biases,
+ const float* bptr,
float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
+ const int, // No need to stride across rows
+ const int output_col_stride
)
{
- (void) output_row_stride;
- (void) _pad_bottom;
- constexpr int output_tile_cols = 2;
- constexpr int inner_tile_cols = 8;
-
- const int pad_right = Specialized ? PadRight : _pad_right;
- const int cells_j = output_tile_cols - pad_right;
-
-
// Construct a map to the output cells
- float *outptrs[cells_j];
- for (int j = 0; j < cells_j; j++)
+ float *outptrs[output_tile_cols];
+ for (int j = 0; j < output_tile_cols; j++)
{
outptrs[j] = output + j*output_col_stride;
}
- const float *inptr = matrix_base;
- const float *bptr = biases;
// For each channel of the output
int channels_remaining = n_channels;
@@ -84,7 +70,7 @@
b = vld1q_f32(bptr);
bptr += 4;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1q_f32(outptrs[j], f[j] + b);
outptrs[j] += 4;
@@ -111,7 +97,7 @@
b = vld1_f32(bptr);
bptr += 2;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1_f32(outptrs[j], f[j] + b);
outptrs[j] += 2;
@@ -138,26 +124,14 @@
{
b = *(bptr++);
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
*(outptrs[j]++) = f[j] + b;
}
}
}
-} // namespace (anonymous)
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>;
+template class OutputTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_2_7_fp32_process_tile<true, 1>
-};
-
-template class OutputTransform<1, 7, 1, 8, float>;
-template class OutputTransform<7, 1, 8, 1, float>;
} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..d6ebf44
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "output.hpp"
+
+namespace winograd
+{
+
+template <>
+void OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* inptr,
+ const int matrix_stride,
+ const float* bptr,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ // Construct a map to the output cells
+ float *outptrs[output_tile_rows][output_tile_cols];
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdupq_n_f32(0.0f);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
+ }
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Load the bias vector
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Load the bias
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
+ }
+ }
+}
+
+template class OutputTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
+
+} // namespace
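Reading aid for the transform above: per channel, the scalar tail computes f = A^T . F . A, where A^T is the 2x4 matrix implied by the commented formulas, and the per-channel bias b is then added to each of the four outputs. A standalone sketch under that reading; the names are illustrative and not part of the patch.

// Reference-only sketch: per-channel F(2x2, 3x3) Winograd output transform, f = A^T * F * A.
namespace winograd_reference {

inline void output_transform_2x2_3x3_single_channel(const float F[4][4], float f[2][2])
{
    // Rows of A^T match the FZ[i][0] / FZ[i][1] formulas above.
    static const float AT[2][4] = {
        { 1.f, 1.f,  1.f,  0.f },
        { 0.f, 1.f, -1.f, -1.f },
    };

    float ATF[2][4];  // A^T * F
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < 4; k++)
            {
                acc += AT[i][k] * F[k][j];
            }
            ATF[i][j] = acc;
        }
    }

    // f = (A^T * F) * A, where A(k, j) == AT[j][k]
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < 4; k++)
            {
                acc += ATF[i][k] * AT[j][k];
            }
            f[i][j] = acc;
        }
    }
}

} // namespace winograd_reference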
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..d93d9e2
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "output.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+template <>
+void OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* inptr,
+ const int matrix_stride,
+ const float* bptr,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ // Construct a map to the output cells
+ float *outptrs[output_tile_rows][output_tile_cols];
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ else
+ {
+ b = vdupq_n_f32(0.0f);
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
+ }
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
+ }
+ }
+}
+
+template class OutputTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
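Note on the transform above: reading off the coefficients used in both the intrinsic and scalar paths, the tile computation is the Winograd output transform f = A^T F A for F(2x2, 5x5), with (as far as the in-code comments indicate)

    A^T = [ 1  1  1  1  1  0 ]
          [ 0  1 -1  2 -2  1 ]

A minimal single-channel sketch of that computation, using a hypothetical helper name purely for illustration (it is not part of this patch):

// Illustrative only: one channel of the F(2x2, 5x5) output transform,
// f = A^T * F * A, mirroring the scalar tail of the kernel above.
static inline void transform_channel_2x2_5x5(const float F[6][6], float f[2][2])
{
    float FZ[6][2];
    for (int i = 0; i < 6; i++)
    {
        // Multiply each row of F by the columns of A.
        FZ[i][0] = F[i][0] + F[i][1] + F[i][2] + F[i][3] + F[i][4];
        FZ[i][1] = F[i][1] - F[i][2] + 2.0f*F[i][3] - 2.0f*F[i][4] + F[i][5];
    }
    for (int j = 0; j < 2; j++)
    {
        // Multiply the rows of A^T by each column of FZ.
        f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j] + FZ[3][j] + FZ[4][j];
        f[1][j] = FZ[1][j] - FZ[2][j] + 2.0f*FZ[3][j] - 2.0f*FZ[4][j] + FZ[5][j];
    }
}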
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
similarity index 73%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
index 911759b..7187ef2 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,42 +22,29 @@
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "output.hpp"
+#include "arm.hpp"
-namespace
+namespace winograd
{
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_4_5_fp32_process_tile(
+template <>
+void OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
const int n_channels,
- const float* const matrix_base,
+ const float* inptr,
const int matrix_stride,
- const float* const biases,
+ const float* bptr,
float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
+ const int, // No need to stride across rows
+ const int output_col_stride
)
{
- (void) output_row_stride;
- (void) _pad_bottom;
- constexpr int output_tile_cols = 4;
- constexpr int inner_tile_cols = 8;
-
- const int pad_right = Specialized ? PadRight : _pad_right;
- const int cells_j = output_tile_cols - pad_right;
-
// Construct a map to the output cells
- float *outptrs[cells_j];
- for (int j = 0; j < cells_j; j++)
+ float *outptrs[output_tile_cols];
+ for (int j = 0; j < output_tile_cols; j++)
{
outptrs[j] = output + j*output_col_stride;
}
- const float *inptr = matrix_base;
- const float *bptr = biases;
// For each channel of the output
int channels_remaining = n_channels;
@@ -85,7 +72,7 @@
b = vld1q_f32(bptr);
bptr += 4;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1q_f32(outptrs[j], f[j] + b);
outptrs[j] += 4;
@@ -114,7 +101,7 @@
b = vld1_f32(bptr);
bptr += 2;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1_f32(outptrs[j], f[j] + b);
outptrs[j] += 2;
@@ -143,29 +130,14 @@
{
b = *(bptr++);
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
*(outptrs[j]++) = f[j] + b;
}
}
}
-} // namespace (anonymous)
+template class OutputTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_4_5_fp32_process_tile<true, 1>,
- winograd_output_transform_4_5_fp32_process_tile<true, 2>,
- winograd_output_transform_4_5_fp32_process_tile<true, 3>
-};
-
-template class OutputTransform<1, 5, 1, 8, float>;
-template class OutputTransform<5, 1, 8, 1, float>;
} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..fd16a4d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,1855 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "output.hpp"
+
+namespace winograd
+{
+
+#ifdef __aarch64__
+
+template <>
+void OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>::transform_tile(
+ int n_channels,
+ const float* inptr,
+ const int matrix_stride,
+ const float* bptr,
+ float* output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ const float coeffs[2] = {2.0f, 4.0f};
+ if (bptr != nullptr)
+ {
+ __asm__ __volatile__ (
+ "ldr d0, [%[pcoeffs]]\n"
+ "add x21, %[in_col_stride1], %[in_col_stride1]\n"
+ "add x22, x21, %[in_col_stride1]\n"
+ "add x25, %[inptr0], %[in_row_stride]\n"
+ "add x15, %[output_col_stride1], %[output_col_stride1]\n"
+ "add x23, x22, %[in_col_stride1]\n"
+ "add x13, x25, %[in_row_stride]\n"
+ "add x16, x15, %[output_col_stride1]\n"
+ "add x24, x23, %[in_col_stride1]\n"
+ "add x26, x13, %[in_row_stride]\n"
+ "add x17, %[outptr0], %[output_row_stride]\n"
+ "add x14, x26, %[in_row_stride]\n"
+ "add x28, x17, %[output_row_stride]\n"
+ "lsr x19, %[n_channels], #2\n"
+ "add x27, x14, %[in_row_stride]\n"
+ "add x18, x28, %[output_row_stride]\n"
+ "and x20, %[n_channels], #3\n"
+ "cbz x19, 4f\n"
+ "1:\n"
+ "ldr q19, [%[inptr0]]\n"
+ "subs x19, x19, #1\n"
+ "ldr q20, [%[inptr0], %[in_col_stride1]]\n"
+ "ldr q4, [%[inptr0], x21]\n"
+ "fadd v1.4s, v20.4s, v4.4s\n"
+ "ldr q17, [%[inptr0], x22]\n"
+ "fsub v7.4s, v20.4s, v4.4s\n"
+ "ldr q22, [%[inptr0], x23]\n"
+ "fadd v5.4s, v17.4s, v22.4s\n"
+ "ldr q18, [%[inptr0], x24]\n"
+ "fsub v10.4s, v17.4s, v22.4s\n"
+ "ldr q25, [x25]\n"
+ "fadd v8.4s, v19.4s, v1.4s\n"
+ "ldr q12, [x25, %[in_col_stride1]]\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr q23, [x25, x21]\n"
+ "mov v1.16b, v7.16b\n"
+ "ldr q9, [x25, x22]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "ldr q11, [x25, x23]\n"
+ "fadd v8.4s, v8.4s, v5.4s\n"
+ "ldr q6, [x25, x24]\n"
+ "fmla v4.4s, v5.4s, v0.s[1]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "fmla v1.4s, v10.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "beq 3f\n"
+ "2:\n"
+ "fadd v3.4s, v12.4s, v23.4s\n"
+ "ldr q2, [x13]\n"
+ "fadd v27.4s, v9.4s, v11.4s\n"
+ "ldr q21, [x13, %[in_col_stride1]]\n"
+ "fsub v16.4s, v12.4s, v23.4s\n"
+ "ldr q26, [x13, x21]\n"
+ "fsub v9.4s, v9.4s, v11.4s\n"
+ "ldr q17, [x13, x22]\n"
+ "fadd v14.4s, v25.4s, v3.4s\n"
+ "ldr q19, [x13, x23]\n"
+ "mov v11.16b, v3.16b\n"
+ "ldr q10, [x13, x24]\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q15, [x26]\n"
+ "fmul v9.4s, v9.4s, v0.s[0]\n"
+ "ldr q12, [x26, %[in_col_stride1]]\n"
+ "fadd v14.4s, v14.4s, v27.4s\n"
+ "ldr q20, [x26, x21]\n"
+ "fmla v11.4s, v27.4s, v0.s[1]\n"
+ "ldr q24, [x26, x22]\n"
+ "fadd v23.4s, v21.4s, v26.4s\n"
+ "ldr q29, [x26, x23]\n"
+ "fadd v13.4s, v16.4s, v9.4s\n"
+ "ldr q5, [x26, x24]\n"
+ "fmla v3.4s, v9.4s, v0.s[1]\n"
+ "ldr q18, [x14]\n"
+ "fadd v30.4s, v17.4s, v19.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fadd v16.4s, v2.4s, v23.4s\n"
+ "add x25, x25, #16\n"
+ "fsub v21.4s, v21.4s, v26.4s\n"
+ "ldr q22, [x14, %[in_col_stride1]]\n"
+ "fadd v3.4s, v3.4s, v6.4s\n"
+ "ldr q28, [x14, x21]\n"
+ "fsub v19.4s, v17.4s, v19.4s\n"
+ "add x13, x13, #16\n"
+ "fadd v16.4s, v16.4s, v30.4s\n"
+ "add x26, x26, #16\n"
+ "mov v17.16b, v23.16b\n"
+ "subs x19, x19, #1\n"
+ "fadd v26.4s, v12.4s, v20.4s\n"
+ "fsub v9.4s, v12.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v0.s[0]\n"
+ "ldr q20, [x14, x22]\n"
+ "fmla v17.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v24.4s, v29.4s\n"
+ "fsub v12.4s, v24.4s, v29.4s\n"
+ "fadd v24.4s, v22.4s, v28.4s\n"
+ "fadd v23.4s, v15.4s, v26.4s\n"
+ "mov v15.16b, v26.16b\n"
+ "fsub v22.4s, v22.4s, v28.4s\n"
+ "fadd v29.4s, v14.4s, v16.4s\n"
+ "fsub v16.4s, v14.4s, v16.4s\n"
+ "ldr q28, [x14, x23]\n"
+ "fmul v12.4s, v12.4s, v0.s[0]\n"
+ "fmla v15.4s, v25.4s, v0.s[1]\n"
+ "fadd v23.4s, v23.4s, v25.4s\n"
+ "mov v6.16b, v21.16b\n"
+ "fadd v30.4s, v21.4s, v19.4s\n"
+ "fadd v26.4s, v18.4s, v24.4s\n"
+ "mov v25.16b, v24.16b\n"
+ "fadd v18.4s, v8.4s, v29.4s\n"
+ "fmla v6.4s, v19.4s, v0.s[1]\n"
+ "fadd v27.4s, v20.4s, v28.4s\n"
+ "fsub v21.4s, v20.4s, v28.4s\n"
+ "mov v19.16b, v29.16b\n"
+ "fadd v29.4s, v13.4s, v30.4s\n"
+ "fsub v8.4s, v13.4s, v30.4s\n"
+ "fadd v14.4s, v9.4s, v12.4s\n"
+ "fadd v6.4s, v6.4s, v10.4s\n"
+ "ldr q20, [x14, x24]\n"
+ "fadd v26.4s, v26.4s, v27.4s\n"
+ "add x14, x14, #16\n"
+ "fmla v9.4s, v12.4s, v0.s[1]\n"
+ "ldr q24, [x27]\n"
+ "fmul v21.4s, v21.4s, v0.s[0]\n"
+ "fmla v25.4s, v27.4s, v0.s[1]\n"
+ "fadd v10.4s, v7.4s, v29.4s\n"
+ "ldr q2, [%[bptr]]\n"
+ "mov v7.16b, v29.16b\n"
+ "add %[bptr], %[bptr], #16\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "fadd v13.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "fadd v27.4s, v11.4s, v17.4s\n"
+ "fsub v11.4s, v11.4s, v17.4s\n"
+ "fadd v30.4s, v15.4s, v25.4s\n"
+ "fsub v15.4s, v15.4s, v25.4s\n"
+ "ldr q28, [x27, %[in_col_stride1]]\n"
+ "fadd v18.4s, v18.4s, v13.4s\n"
+ "fmla v19.4s, v13.4s, v0.s[1]\n"
+ "fadd v26.4s, v22.4s, v21.4s\n"
+ "mov v12.16b, v22.16b\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fadd v17.4s, v4.4s, v27.4s\n"
+ "fmul v15.4s, v15.4s, v0.s[0]\n"
+ "mov v4.16b, v27.16b\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "ldr q22, [x27, x21]\n"
+ "fadd v18.4s, v18.4s, v2.4s\n"
+ "fadd v19.4s, v19.4s, v2.4s\n"
+ "fadd v17.4s, v17.4s, v30.4s\n"
+ "fmla v4.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v28.4s, v22.4s\n"
+ "fsub v27.4s, v28.4s, v22.4s\n"
+ "fadd v12.4s, v12.4s, v20.4s\n"
+ "ldr q29, [x27, x22]\n"
+ "str q18, [%[outptr0]]\n"
+ "fadd v22.4s, v16.4s, v23.4s\n"
+ "str q19, [x28]\n"
+ "fadd v28.4s, v24.4s, v25.4s\n"
+ "ldr q30, [x27, x23]\n"
+ "fadd v20.4s, v29.4s, v30.4s\n"
+ "fsub v18.4s, v29.4s, v30.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "ldr q25, [x27, x24]\n"
+ "fmla v16.4s, v23.4s, v0.s[1]\n"
+ "ldr q19, [%[inptr0]]\n"
+ "fadd v17.4s, v17.4s, v2.4s\n"
+ "add x27, x27, #16\n"
+ "fadd v28.4s, v28.4s, v20.4s\n"
+ "fmul v18.4s, v18.4s, v0.s[0]\n"
+ "fmla v21.4s, v20.4s, v0.s[1]\n"
+ "ldr q20, [%[inptr0], %[in_col_stride1]]\n"
+ "fadd v22.4s, v22.4s, v2.4s\n"
+ "fadd v4.4s, v4.4s, v2.4s\n"
+ "str q17, [%[outptr0], x15]\n"
+ "mov v24.16b, v27.16b\n"
+ "fadd v23.4s, v27.4s, v18.4s\n"
+ "fadd v16.4s, v16.4s, v28.4s\n"
+ "fadd v13.4s, v14.4s, v26.4s\n"
+ "fsub v30.4s, v14.4s, v26.4s\n"
+ "str q22, [x17]\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "str q4, [x28, x15]\n"
+ "mov v14.16b, v8.16b\n"
+ "fadd v29.4s, v11.4s, v15.4s\n"
+ "ldr q4, [%[inptr0], x21]\n"
+ "fadd v10.4s, v10.4s, v13.4s\n"
+ "ldr q17, [%[inptr0], x22]\n"
+ "fadd v24.4s, v24.4s, v25.4s\n"
+ "ldr q22, [%[inptr0], x23]\n"
+ "fmul v30.4s, v30.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "mov v26.16b, v11.16b\n"
+ "fadd v13.4s, v3.4s, v6.4s\n"
+ "fsub v3.4s, v3.4s, v6.4s\n"
+ "ldr q18, [%[inptr0], x24]\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "fadd v29.4s, v29.4s, v2.4s\n"
+ "fadd v8.4s, v8.4s, v30.4s\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "ldr q25, [x25]\n"
+ "fadd v27.4s, v9.4s, v12.4s\n"
+ "fadd v1.4s, v1.4s, v13.4s\n"
+ "str q10, [%[outptr0], %[output_col_stride1]]\n"
+ "fsub v6.4s, v9.4s, v12.4s\n"
+ "str q29, [x17, x15]\n"
+ "fadd v14.4s, v14.4s, v23.4s\n"
+ "fadd v26.4s, v26.4s, v21.4s\n"
+ "ldr q12, [x25, %[in_col_stride1]]\n"
+ "fadd v1.4s, v1.4s, v27.4s\n"
+ "ldr q23, [x25, x21]\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "ldr q9, [x25, x22]\n"
+ "mov v5.16b, v13.16b\n"
+ "ldr q11, [x25, x23]\n"
+ "mov v13.16b, v3.16b\n"
+ "fadd v8.4s, v8.4s, v2.4s\n"
+ "fadd v1.4s, v1.4s, v2.4s\n"
+ "fadd v7.4s, v7.4s, v2.4s\n"
+ "fadd v10.4s, v3.4s, v6.4s\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x25, x24]\n"
+ "str q8, [x17, %[output_col_stride1]]\n"
+ "fadd v16.4s, v16.4s, v2.4s\n"
+ "str q1, [%[outptr0], x16]\n"
+ "fadd v14.4s, v14.4s, v2.4s\n"
+ "str q7, [x28, %[output_col_stride1]]\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "fadd v13.4s, v13.4s, v24.4s\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q16, [x18]\n"
+ "fadd v5.4s, v5.4s, v2.4s\n"
+ "str q14, [x18, %[output_col_stride1]]\n"
+ "fadd v26.4s, v26.4s, v2.4s\n"
+ "str q10, [x17, x16]\n"
+ "fadd v1.4s, v20.4s, v4.4s\n"
+ "fadd v13.4s, v13.4s, v2.4s\n"
+ "add x17, x17, #16\n"
+ "str q5, [x28, x16]\n"
+ "fadd v5.4s, v17.4s, v22.4s\n"
+ "str q26, [x18, x15]\n"
+ "fsub v7.4s, v20.4s, v4.4s\n"
+ "fadd v8.4s, v19.4s, v1.4s\n"
+ "add x28, x28, #16\n"
+ "str q13, [x18, x16]\n"
+ "mov v4.16b, v1.16b\n"
+ "fsub v10.4s, v17.4s, v22.4s\n"
+ "add x18, x18, #16\n"
+ "mov v1.16b, v7.16b\n"
+ "fadd v8.4s, v8.4s, v5.4s\n"
+ "fmla v4.4s, v5.4s, v0.s[1]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "fmla v1.4s, v10.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "bne 2b\n"
+ "3:\n"
+ "fadd v3.4s, v12.4s, v23.4s\n"
+ "ldr q2, [x13]\n"
+ "fadd v27.4s, v9.4s, v11.4s\n"
+ "ldr q21, [x13, %[in_col_stride1]]\n"
+ "fsub v16.4s, v12.4s, v23.4s\n"
+ "ldr q26, [x13, x21]\n"
+ "fsub v9.4s, v9.4s, v11.4s\n"
+ "ldr q17, [x13, x22]\n"
+ "fadd v14.4s, v25.4s, v3.4s\n"
+ "ldr q19, [x13, x23]\n"
+ "mov v11.16b, v3.16b\n"
+ "ldr q10, [x13, x24]\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr q15, [x26]\n"
+ "fmul v9.4s, v9.4s, v0.s[0]\n"
+ "ldr q12, [x26, %[in_col_stride1]]\n"
+ "fadd v14.4s, v14.4s, v27.4s\n"
+ "ldr q20, [x26, x21]\n"
+ "fmla v11.4s, v27.4s, v0.s[1]\n"
+ "ldr q24, [x26, x22]\n"
+ "fadd v23.4s, v21.4s, v26.4s\n"
+ "ldr q29, [x26, x23]\n"
+ "fadd v13.4s, v16.4s, v9.4s\n"
+ "ldr q5, [x26, x24]\n"
+ "fmla v3.4s, v9.4s, v0.s[1]\n"
+ "ldr q18, [x14]\n"
+ "fadd v30.4s, v17.4s, v19.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fadd v16.4s, v2.4s, v23.4s\n"
+ "add x25, x25, #16\n"
+ "fsub v21.4s, v21.4s, v26.4s\n"
+ "ldr q22, [x14, %[in_col_stride1]]\n"
+ "fadd v3.4s, v3.4s, v6.4s\n"
+ "ldr q28, [x14, x21]\n"
+ "fsub v19.4s, v17.4s, v19.4s\n"
+ "add x13, x13, #16\n"
+ "fadd v16.4s, v16.4s, v30.4s\n"
+ "add x26, x26, #16\n"
+ "mov v17.16b, v23.16b\n"
+ "fadd v26.4s, v12.4s, v20.4s\n"
+ "fsub v9.4s, v12.4s, v20.4s\n"
+ "ldr q2, [%[bptr]]\n"
+ "fmul v19.4s, v19.4s, v0.s[0]\n"
+ "add %[bptr], %[bptr], #16\n"
+ "fmla v17.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v24.4s, v29.4s\n"
+ "fadd v23.4s, v15.4s, v26.4s\n"
+ "fsub v12.4s, v24.4s, v29.4s\n"
+ "mov v15.16b, v26.16b\n"
+ "fadd v24.4s, v22.4s, v28.4s\n"
+ "fsub v22.4s, v22.4s, v28.4s\n"
+ "fadd v29.4s, v14.4s, v16.4s\n"
+ "fsub v16.4s, v14.4s, v16.4s\n"
+ "ldr q20, [x14, x22]\n"
+ "fadd v23.4s, v23.4s, v25.4s\n"
+ "fmul v12.4s, v12.4s, v0.s[0]\n"
+ "fmla v15.4s, v25.4s, v0.s[1]\n"
+ "mov v6.16b, v21.16b\n"
+ "fadd v30.4s, v21.4s, v19.4s\n"
+ "fadd v26.4s, v18.4s, v24.4s\n"
+ "mov v25.16b, v24.16b\n"
+ "fadd v18.4s, v8.4s, v29.4s\n"
+ "fmla v6.4s, v19.4s, v0.s[1]\n"
+ "mov v19.16b, v29.16b\n"
+ "fadd v27.4s, v11.4s, v17.4s\n"
+ "fsub v11.4s, v11.4s, v17.4s\n"
+ "fadd v29.4s, v13.4s, v30.4s\n"
+ "fsub v8.4s, v13.4s, v30.4s\n"
+ "fadd v14.4s, v9.4s, v12.4s\n"
+ "fadd v6.4s, v6.4s, v10.4s\n"
+ "ldr q28, [x14, x23]\n"
+ "fadd v17.4s, v4.4s, v27.4s\n"
+ "mov v4.16b, v27.16b\n"
+ "fmla v9.4s, v12.4s, v0.s[1]\n"
+ "fadd v27.4s, v20.4s, v28.4s\n"
+ "fsub v21.4s, v20.4s, v28.4s\n"
+ "fadd v10.4s, v7.4s, v29.4s\n"
+ "mov v7.16b, v29.16b\n"
+ "fadd v13.4s, v3.4s, v6.4s\n"
+ "fsub v3.4s, v3.4s, v6.4s\n"
+ "ldr q20, [x14, x24]\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "fadd v26.4s, v26.4s, v27.4s\n"
+ "fmul v21.4s, v21.4s, v0.s[0]\n"
+ "add x14, x14, #16\n"
+ "fmla v25.4s, v27.4s, v0.s[1]\n"
+ "mov v12.16b, v22.16b\n"
+ "fadd v1.4s, v1.4s, v13.4s\n"
+ "mov v5.16b, v13.16b\n"
+ "fadd v13.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "fadd v26.4s, v22.4s, v21.4s\n"
+ "ldr q24, [x27]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "fadd v30.4s, v15.4s, v25.4s\n"
+ "fsub v15.4s, v15.4s, v25.4s\n"
+ "ldr q28, [x27, %[in_col_stride1]]\n"
+ "fadd v18.4s, v18.4s, v13.4s\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fmla v19.4s, v13.4s, v0.s[1]\n"
+ "ldr q22, [x27, x21]\n"
+ "fadd v12.4s, v12.4s, v20.4s\n"
+ "ldr q29, [x27, x22]\n"
+ "fadd v17.4s, v17.4s, v30.4s\n"
+ "fmul v15.4s, v15.4s, v0.s[0]\n"
+ "fmla v4.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v28.4s, v22.4s\n"
+ "fsub v27.4s, v28.4s, v22.4s\n"
+ "fadd v22.4s, v16.4s, v23.4s\n"
+ "fadd v18.4s, v18.4s, v2.4s\n"
+ "fadd v17.4s, v17.4s, v2.4s\n"
+ "fadd v19.4s, v19.4s, v2.4s\n"
+ "fadd v28.4s, v24.4s, v25.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v16.4s, v23.4s, v0.s[1]\n"
+ "ldr q30, [x27, x23]\n"
+ "str q18, [%[outptr0]]\n"
+ "fadd v20.4s, v29.4s, v30.4s\n"
+ "str q17, [%[outptr0], x15]\n"
+ "fsub v18.4s, v29.4s, v30.4s\n"
+ "str q19, [x28]\n"
+ "mov v24.16b, v27.16b\n"
+ "fadd v13.4s, v14.4s, v26.4s\n"
+ "ldr q25, [x27, x24]\n"
+ "fadd v28.4s, v28.4s, v20.4s\n"
+ "add x27, x27, #16\n"
+ "fmul v18.4s, v18.4s, v0.s[0]\n"
+ "fmla v21.4s, v20.4s, v0.s[1]\n"
+ "fsub v30.4s, v14.4s, v26.4s\n"
+ "mov v14.16b, v8.16b\n"
+ "fadd v10.4s, v10.4s, v13.4s\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "fadd v16.4s, v16.4s, v28.4s\n"
+ "fadd v29.4s, v11.4s, v15.4s\n"
+ "fadd v23.4s, v27.4s, v18.4s\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "fmul v30.4s, v30.4s, v0.s[0]\n"
+ "mov v26.16b, v11.16b\n"
+ "fadd v27.4s, v9.4s, v12.4s\n"
+ "fsub v6.4s, v9.4s, v12.4s\n"
+ "mov v13.16b, v3.16b\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "fadd v24.4s, v24.4s, v25.4s\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "fadd v8.4s, v8.4s, v30.4s\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v27.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "str q10, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "fadd v26.4s, v26.4s, v21.4s\n"
+ "fadd v22.4s, v22.4s, v2.4s\n"
+ "fadd v14.4s, v14.4s, v23.4s\n"
+ "fadd v8.4s, v8.4s, v2.4s\n"
+ "fadd v10.4s, v3.4s, v6.4s\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v2.4s\n"
+ "fadd v29.4s, v29.4s, v2.4s\n"
+ "str q22, [x17]\n"
+ "fadd v7.4s, v7.4s, v2.4s\n"
+ "str q8, [x17, %[output_col_stride1]]\n"
+ "fadd v4.4s, v4.4s, v2.4s\n"
+ "fadd v13.4s, v13.4s, v24.4s\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "str q1, [%[outptr0], x16]\n"
+ "fadd v5.4s, v5.4s, v2.4s\n"
+ "str q29, [x17, x15]\n"
+ "fadd v16.4s, v16.4s, v2.4s\n"
+ "str q7, [x28, %[output_col_stride1]]\n"
+ "fadd v14.4s, v14.4s, v2.4s\n"
+ "str q10, [x17, x16]\n"
+ "fadd v26.4s, v26.4s, v2.4s\n"
+ "str q4, [x28, x15]\n"
+ "fadd v13.4s, v13.4s, v2.4s\n"
+ "str q5, [x28, x16]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "str q16, [x18]\n"
+ "add x17, x17, #16\n"
+ "str q14, [x18, %[output_col_stride1]]\n"
+ "add x28, x28, #16\n"
+ "str q26, [x18, x15]\n"
+ "str q13, [x18, x16]\n"
+ "add x18, x18, #16\n"
+ "4:\n"
+ "cmp x20, #2\n"
+ "blt 5f\n"
+ "ldr d19, [%[inptr0]]\n"
+ "ldr d20, [%[inptr0], %[in_col_stride1]]\n"
+ "sub x20, x20, #2\n"
+ "ldr d4, [%[inptr0], x21]\n"
+ "ldr d17, [%[inptr0], x22]\n"
+ "fadd v1.4s, v20.4s, v4.4s\n"
+ "ldr d22, [%[inptr0], x23]\n"
+ "fadd v5.4s, v17.4s, v22.4s\n"
+ "ldr d18, [%[inptr0], x24]\n"
+ "fsub v7.4s, v20.4s, v4.4s\n"
+ "ldr d25, [x25]\n"
+ "fsub v10.4s, v17.4s, v22.4s\n"
+ "ldr d12, [x25, %[in_col_stride1]]\n"
+ "fadd v8.4s, v19.4s, v1.4s\n"
+ "ldr d23, [x25, x21]\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d9, [x25, x22]\n"
+ "mov v1.16b, v7.16b\n"
+ "ldr d11, [x25, x23]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "ldr d6, [x25, x24]\n"
+ "fadd v8.4s, v8.4s, v5.4s\n"
+ "ldr d2, [x13]\n"
+ "fmla v4.4s, v5.4s, v0.s[1]\n"
+ "ldr d21, [x13, %[in_col_stride1]]\n"
+ "fadd v3.4s, v12.4s, v23.4s\n"
+ "ldr d26, [x13, x21]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "ldr d17, [x13, x22]\n"
+ "fmla v1.4s, v10.4s, v0.s[1]\n"
+ "ldr d19, [x13, x23]\n"
+ "fadd v27.4s, v9.4s, v11.4s\n"
+ "ldr d10, [x13, x24]\n"
+ "fadd v14.4s, v25.4s, v3.4s\n"
+ "ldr d15, [x26]\n"
+ "fsub v16.4s, v12.4s, v23.4s\n"
+ "ldr d12, [x26, %[in_col_stride1]]\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "ldr d20, [x26, x21]\n"
+ "fsub v9.4s, v9.4s, v11.4s\n"
+ "ldr d24, [x26, x22]\n"
+ "fadd v14.4s, v14.4s, v27.4s\n"
+ "ldr d29, [x26, x23]\n"
+ "mov v11.16b, v3.16b\n"
+ "ldr d5, [x26, x24]\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr d18, [x14]\n"
+ "fmul v9.4s, v9.4s, v0.s[0]\n"
+ "add %[inptr0], %[inptr0], #8\n"
+ "fmla v11.4s, v27.4s, v0.s[1]\n"
+ "add x25, x25, #8\n"
+ "fadd v23.4s, v21.4s, v26.4s\n"
+ "add x13, x13, #8\n"
+ "fsub v21.4s, v21.4s, v26.4s\n"
+ "ldr d22, [x14, %[in_col_stride1]]\n"
+ "fadd v13.4s, v16.4s, v9.4s\n"
+ "add x26, x26, #8\n"
+ "fmla v3.4s, v9.4s, v0.s[1]\n"
+ "fadd v30.4s, v17.4s, v19.4s\n"
+ "fadd v16.4s, v2.4s, v23.4s\n"
+ "fsub v19.4s, v17.4s, v19.4s\n"
+ "mov v17.16b, v23.16b\n"
+ "fadd v26.4s, v12.4s, v20.4s\n"
+ "fsub v9.4s, v12.4s, v20.4s\n"
+ "ldr d28, [x14, x21]\n"
+ "fadd v3.4s, v3.4s, v6.4s\n"
+ "ldr d20, [x14, x22]\n"
+ "fadd v16.4s, v16.4s, v30.4s\n"
+ "fmul v19.4s, v19.4s, v0.s[0]\n"
+ "fmla v17.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v24.4s, v29.4s\n"
+ "fadd v23.4s, v15.4s, v26.4s\n"
+ "fsub v12.4s, v24.4s, v29.4s\n"
+ "mov v15.16b, v26.16b\n"
+ "fadd v24.4s, v22.4s, v28.4s\n"
+ "fsub v22.4s, v22.4s, v28.4s\n"
+ "fadd v29.4s, v14.4s, v16.4s\n"
+ "fsub v16.4s, v14.4s, v16.4s\n"
+ "ldr d28, [x14, x23]\n"
+ "fadd v23.4s, v23.4s, v25.4s\n"
+ "fmul v12.4s, v12.4s, v0.s[0]\n"
+ "fmla v15.4s, v25.4s, v0.s[1]\n"
+ "mov v6.16b, v21.16b\n"
+ "fadd v30.4s, v21.4s, v19.4s\n"
+ "fadd v26.4s, v18.4s, v24.4s\n"
+ "mov v25.16b, v24.16b\n"
+ "fadd v18.4s, v8.4s, v29.4s\n"
+ "fmla v6.4s, v19.4s, v0.s[1]\n"
+ "fadd v27.4s, v20.4s, v28.4s\n"
+ "fsub v21.4s, v20.4s, v28.4s\n"
+ "mov v19.16b, v29.16b\n"
+ "fadd v29.4s, v13.4s, v30.4s\n"
+ "fsub v8.4s, v13.4s, v30.4s\n"
+ "fadd v14.4s, v9.4s, v12.4s\n"
+ "fadd v6.4s, v6.4s, v10.4s\n"
+ "ldr d20, [x14, x24]\n"
+ "fadd v26.4s, v26.4s, v27.4s\n"
+ "add x14, x14, #8\n"
+ "fmla v9.4s, v12.4s, v0.s[1]\n"
+ "ldr d24, [x27]\n"
+ "fmul v21.4s, v21.4s, v0.s[0]\n"
+ "fmla v25.4s, v27.4s, v0.s[1]\n"
+ "fadd v10.4s, v7.4s, v29.4s\n"
+ "ldr d2, [%[bptr]]\n"
+ "mov v7.16b, v29.16b\n"
+ "add %[bptr], %[bptr], #8\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "fadd v13.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "fadd v27.4s, v11.4s, v17.4s\n"
+ "fsub v11.4s, v11.4s, v17.4s\n"
+ "fadd v30.4s, v15.4s, v25.4s\n"
+ "fsub v15.4s, v15.4s, v25.4s\n"
+ "ldr d28, [x27, %[in_col_stride1]]\n"
+ "fadd v18.4s, v18.4s, v13.4s\n"
+ "fmla v19.4s, v13.4s, v0.s[1]\n"
+ "fadd v26.4s, v22.4s, v21.4s\n"
+ "mov v12.16b, v22.16b\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fadd v17.4s, v4.4s, v27.4s\n"
+ "fmul v15.4s, v15.4s, v0.s[0]\n"
+ "mov v4.16b, v27.16b\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "ldr d22, [x27, x21]\n"
+ "fadd v18.4s, v18.4s, v2.4s\n"
+ "fadd v19.4s, v19.4s, v2.4s\n"
+ "fadd v17.4s, v17.4s, v30.4s\n"
+ "fmla v4.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v28.4s, v22.4s\n"
+ "fsub v27.4s, v28.4s, v22.4s\n"
+ "fadd v12.4s, v12.4s, v20.4s\n"
+ "ldr d29, [x27, x22]\n"
+ "str d18, [%[outptr0]]\n"
+ "fadd v22.4s, v16.4s, v23.4s\n"
+ "str d19, [x28]\n"
+ "fadd v28.4s, v24.4s, v25.4s\n"
+ "ldr d30, [x27, x23]\n"
+ "fadd v20.4s, v29.4s, v30.4s\n"
+ "fsub v18.4s, v29.4s, v30.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "ldr d25, [x27, x24]\n"
+ "fmla v16.4s, v23.4s, v0.s[1]\n"
+ "add x27, x27, #8\n"
+ "mov v24.16b, v27.16b\n"
+ "fadd v17.4s, v17.4s, v2.4s\n"
+ "fadd v28.4s, v28.4s, v20.4s\n"
+ "fmul v18.4s, v18.4s, v0.s[0]\n"
+ "fmla v21.4s, v20.4s, v0.s[1]\n"
+ "fadd v13.4s, v14.4s, v26.4s\n"
+ "fsub v30.4s, v14.4s, v26.4s\n"
+ "mov v14.16b, v8.16b\n"
+ "str d17, [%[outptr0], x15]\n"
+ "fadd v29.4s, v11.4s, v15.4s\n"
+ "fadd v23.4s, v27.4s, v18.4s\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "fadd v16.4s, v16.4s, v28.4s\n"
+ "fadd v10.4s, v10.4s, v13.4s\n"
+ "fmul v30.4s, v30.4s, v0.s[0]\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "mov v26.16b, v11.16b\n"
+ "fadd v13.4s, v3.4s, v6.4s\n"
+ "fadd v24.4s, v24.4s, v25.4s\n"
+ "fadd v27.4s, v9.4s, v12.4s\n"
+ "fsub v3.4s, v3.4s, v6.4s\n"
+ "fsub v6.4s, v9.4s, v12.4s\n"
+ "fadd v8.4s, v8.4s, v30.4s\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v13.4s\n"
+ "mov v5.16b, v13.16b\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "mov v13.16b, v3.16b\n"
+ "fadd v14.4s, v14.4s, v23.4s\n"
+ "fadd v22.4s, v22.4s, v2.4s\n"
+ "fadd v26.4s, v26.4s, v21.4s\n"
+ "fadd v1.4s, v1.4s, v27.4s\n"
+ "str d10, [%[outptr0], %[output_col_stride1]]\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "fadd v10.4s, v3.4s, v6.4s\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "str d22, [x17]\n"
+ "fadd v8.4s, v8.4s, v2.4s\n"
+ "fadd v1.4s, v1.4s, v2.4s\n"
+ "fadd v29.4s, v29.4s, v2.4s\n"
+ "fadd v7.4s, v7.4s, v2.4s\n"
+ "fadd v4.4s, v4.4s, v2.4s\n"
+ "fadd v13.4s, v13.4s, v24.4s\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "str d8, [x17, %[output_col_stride1]]\n"
+ "fadd v5.4s, v5.4s, v2.4s\n"
+ "str d1, [%[outptr0], x16]\n"
+ "fadd v16.4s, v16.4s, v2.4s\n"
+ "str d29, [x17, x15]\n"
+ "fadd v14.4s, v14.4s, v2.4s\n"
+ "str d10, [x17, x16]\n"
+ "fadd v26.4s, v26.4s, v2.4s\n"
+ "str d7, [x28, %[output_col_stride1]]\n"
+ "fadd v13.4s, v13.4s, v2.4s\n"
+ "str d4, [x28, x15]\n"
+ "add %[outptr0], %[outptr0], #8\n"
+ "str d5, [x28, x16]\n"
+ "add x17, x17, #8\n"
+ "str d16, [x18]\n"
+ "add x28, x28, #8\n"
+ "str d14, [x18, %[output_col_stride1]]\n"
+ "str d26, [x18, x15]\n"
+ "str d13, [x18, x16]\n"
+ "add x18, x18, #8\n"
+ "5:\n"
+ "cbz x20, 6f\n"
+ "ldr s19, [%[inptr0]]\n"
+ "ldr s20, [%[inptr0], %[in_col_stride1]]\n"
+ "ldr s4, [%[inptr0], x21]\n"
+ "fadd v1.4s, v20.4s, v4.4s\n"
+ "ldr s17, [%[inptr0], x22]\n"
+ "fsub v7.4s, v20.4s, v4.4s\n"
+ "ldr s22, [%[inptr0], x23]\n"
+ "fadd v5.4s, v17.4s, v22.4s\n"
+ "ldr s18, [%[inptr0], x24]\n"
+ "fsub v10.4s, v17.4s, v22.4s\n"
+ "ldr s25, [x25]\n"
+ "fadd v8.4s, v19.4s, v1.4s\n"
+ "ldr s12, [x25, %[in_col_stride1]]\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr s23, [x25, x21]\n"
+ "mov v1.16b, v7.16b\n"
+ "ldr s9, [x25, x22]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "ldr s11, [x25, x23]\n"
+ "fadd v8.4s, v8.4s, v5.4s\n"
+ "ldr s6, [x25, x24]\n"
+ "fmla v4.4s, v5.4s, v0.s[1]\n"
+ "ldr s2, [x13]\n"
+ "fadd v3.4s, v12.4s, v23.4s\n"
+ "ldr s21, [x13, %[in_col_stride1]]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "ldr s26, [x13, x21]\n"
+ "fmla v1.4s, v10.4s, v0.s[1]\n"
+ "ldr s17, [x13, x22]\n"
+ "fadd v27.4s, v9.4s, v11.4s\n"
+ "ldr s19, [x13, x23]\n"
+ "fadd v14.4s, v25.4s, v3.4s\n"
+ "ldr s10, [x13, x24]\n"
+ "fsub v16.4s, v12.4s, v23.4s\n"
+ "ldr s15, [x26]\n"
+ "fadd v1.4s, v1.4s, v18.4s\n"
+ "ldr s12, [x26, %[in_col_stride1]]\n"
+ "fsub v9.4s, v9.4s, v11.4s\n"
+ "ldr s20, [x26, x21]\n"
+ "fadd v14.4s, v14.4s, v27.4s\n"
+ "ldr s24, [x26, x22]\n"
+ "mov v11.16b, v3.16b\n"
+ "ldr s29, [x26, x23]\n"
+ "mov v3.16b, v16.16b\n"
+ "ldr s5, [x26, x24]\n"
+ "fmul v9.4s, v9.4s, v0.s[0]\n"
+ "ldr s18, [x14]\n"
+ "fmla v11.4s, v27.4s, v0.s[1]\n"
+ "fadd v23.4s, v21.4s, v26.4s\n"
+ "fsub v21.4s, v21.4s, v26.4s\n"
+ "fadd v30.4s, v17.4s, v19.4s\n"
+ "fsub v19.4s, v17.4s, v19.4s\n"
+ "ldr s22, [x14, %[in_col_stride1]]\n"
+ "fadd v13.4s, v16.4s, v9.4s\n"
+ "fmla v3.4s, v9.4s, v0.s[1]\n"
+ "fadd v16.4s, v2.4s, v23.4s\n"
+ "mov v17.16b, v23.16b\n"
+ "fadd v26.4s, v12.4s, v20.4s\n"
+ "fsub v9.4s, v12.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v0.s[0]\n"
+ "ldr s28, [x14, x21]\n"
+ "fadd v3.4s, v3.4s, v6.4s\n"
+ "ldr s20, [x14, x22]\n"
+ "fadd v16.4s, v16.4s, v30.4s\n"
+ "fmla v17.4s, v30.4s, v0.s[1]\n"
+ "fadd v25.4s, v24.4s, v29.4s\n"
+ "fadd v23.4s, v15.4s, v26.4s\n"
+ "fsub v12.4s, v24.4s, v29.4s\n"
+ "mov v15.16b, v26.16b\n"
+ "fadd v24.4s, v22.4s, v28.4s\n"
+ "fsub v22.4s, v22.4s, v28.4s\n"
+ "fadd v30.4s, v21.4s, v19.4s\n"
+ "mov v6.16b, v21.16b\n"
+ "fadd v23.4s, v23.4s, v25.4s\n"
+ "fmla v15.4s, v25.4s, v0.s[1]\n"
+ "fmul v12.4s, v12.4s, v0.s[0]\n"
+ "ldr s28, [x14, x23]\n"
+ "fmla v6.4s, v19.4s, v0.s[1]\n"
+ "fadd v27.4s, v20.4s, v28.4s\n"
+ "fadd v26.4s, v18.4s, v24.4s\n"
+ "fsub v21.4s, v20.4s, v28.4s\n"
+ "mov v25.16b, v24.16b\n"
+ "fadd v29.4s, v14.4s, v16.4s\n"
+ "fsub v16.4s, v14.4s, v16.4s\n"
+ "ldr s20, [x14, x24]\n"
+ "fadd v6.4s, v6.4s, v10.4s\n"
+ "ldr s24, [x27]\n"
+ "fadd v26.4s, v26.4s, v27.4s\n"
+ "fmul v21.4s, v21.4s, v0.s[0]\n"
+ "fmla v25.4s, v27.4s, v0.s[1]\n"
+ "fadd v18.4s, v8.4s, v29.4s\n"
+ "mov v19.16b, v29.16b\n"
+ "fadd v29.4s, v13.4s, v30.4s\n"
+ "fsub v8.4s, v13.4s, v30.4s\n"
+ "fadd v27.4s, v11.4s, v17.4s\n"
+ "fsub v11.4s, v11.4s, v17.4s\n"
+ "fadd v13.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "ldr s28, [x27, %[in_col_stride1]]\n"
+ "fadd v10.4s, v7.4s, v29.4s\n"
+ "mov v7.16b, v29.16b\n"
+ "fadd v17.4s, v4.4s, v27.4s\n"
+ "mov v4.16b, v27.16b\n"
+ "fadd v18.4s, v18.4s, v13.4s\n"
+ "fmla v19.4s, v13.4s, v0.s[1]\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fadd v30.4s, v15.4s, v25.4s\n"
+ "fsub v15.4s, v15.4s, v25.4s\n"
+ "fadd v13.4s, v3.4s, v6.4s\n"
+ "fsub v3.4s, v3.4s, v6.4s\n"
+ "ldr s2, [%[bptr]]\n"
+ "fadd v18.4s, v18.4s, v2.4s\n"
+ "fadd v19.4s, v19.4s, v2.4s\n"
+ "fadd v17.4s, v17.4s, v30.4s\n"
+ "fmla v4.4s, v30.4s, v0.s[1]\n"
+ "fadd v14.4s, v9.4s, v12.4s\n"
+ "fmul v15.4s, v15.4s, v0.s[0]\n"
+ "fadd v1.4s, v1.4s, v13.4s\n"
+ "str s18, [%[outptr0]]\n"
+ "fadd v26.4s, v22.4s, v21.4s\n"
+ "str s19, [x28]\n"
+ "fmla v9.4s, v12.4s, v0.s[1]\n"
+ "mov v12.16b, v22.16b\n"
+ "ldr s22, [x27, x21]\n"
+ "fadd v25.4s, v28.4s, v22.4s\n"
+ "fsub v27.4s, v28.4s, v22.4s\n"
+ "fadd v22.4s, v16.4s, v23.4s\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "ldr s29, [x27, x22]\n"
+ "fmla v12.4s, v21.4s, v0.s[1]\n"
+ "ldr s30, [x27, x23]\n"
+ "fadd v28.4s, v24.4s, v25.4s\n"
+ "mov v21.16b, v25.16b\n"
+ "fmla v16.4s, v23.4s, v0.s[1]\n"
+ "ldr s25, [x27, x24]\n"
+ "mov v5.16b, v13.16b\n"
+ "fadd v17.4s, v17.4s, v2.4s\n"
+ "fadd v12.4s, v12.4s, v20.4s\n"
+ "fadd v20.4s, v29.4s, v30.4s\n"
+ "fsub v18.4s, v29.4s, v30.4s\n"
+ "mov v24.16b, v27.16b\n"
+ "fadd v22.4s, v22.4s, v2.4s\n"
+ "fadd v4.4s, v4.4s, v2.4s\n"
+ "str s17, [%[outptr0], x15]\n"
+ "fadd v13.4s, v14.4s, v26.4s\n"
+ "fadd v28.4s, v28.4s, v20.4s\n"
+ "fmla v21.4s, v20.4s, v0.s[1]\n"
+ "fmul v18.4s, v18.4s, v0.s[0]\n"
+ "fsub v30.4s, v14.4s, v26.4s\n"
+ "str s22, [x17]\n"
+ "mov v14.16b, v8.16b\n"
+ "str s4, [x28, x15]\n"
+ "fadd v10.4s, v10.4s, v13.4s\n"
+ "fadd v16.4s, v16.4s, v28.4s\n"
+ "fmla v7.4s, v13.4s, v0.s[1]\n"
+ "fadd v23.4s, v27.4s, v18.4s\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "fmul v30.4s, v30.4s, v0.s[0]\n"
+ "fadd v29.4s, v11.4s, v15.4s\n"
+ "mov v26.16b, v11.16b\n"
+ "fadd v27.4s, v9.4s, v12.4s\n"
+ "fsub v6.4s, v9.4s, v12.4s\n"
+ "mov v13.16b, v3.16b\n"
+ "fadd v24.4s, v24.4s, v25.4s\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "fadd v8.4s, v8.4s, v30.4s\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v27.4s\n"
+ "fmul v6.4s, v6.4s, v0.s[0]\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "str s10, [%[outptr0], %[output_col_stride1]]\n"
+ "fadd v29.4s, v29.4s, v2.4s\n"
+ "fadd v14.4s, v14.4s, v23.4s\n"
+ "fadd v8.4s, v8.4s, v2.4s\n"
+ "fadd v26.4s, v26.4s, v21.4s\n"
+ "fadd v1.4s, v1.4s, v2.4s\n"
+ "fadd v10.4s, v3.4s, v6.4s\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "str s29, [x17, x15]\n"
+ "fadd v7.4s, v7.4s, v2.4s\n"
+ "str s8, [x17, %[output_col_stride1]]\n"
+ "fadd v5.4s, v5.4s, v2.4s\n"
+ "str s1, [%[outptr0], x16]\n"
+ "fadd v16.4s, v16.4s, v2.4s\n"
+ "fadd v13.4s, v13.4s, v24.4s\n"
+ "fadd v10.4s, v10.4s, v2.4s\n"
+ "str s7, [x28, %[output_col_stride1]]\n"
+ "fadd v14.4s, v14.4s, v2.4s\n"
+ "str s5, [x28, x16]\n"
+ "fadd v26.4s, v26.4s, v2.4s\n"
+ "str s16, [x18]\n"
+ "fadd v13.4s, v13.4s, v2.4s\n"
+ "str s10, [x17, x16]\n"
+ "str s14, [x18, %[output_col_stride1]]\n"
+ "str s26, [x18, x15]\n"
+ "str s13, [x18, x16]\n"
+ "6:\n"
+ : [bptr] "+r" (bptr), [outptr0] "+r" (output), [inptr0] "+r" (inptr)
+ : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [pcoeffs] "r" (coeffs), [n_channels] "r" ((long) n_channels), [in_row_stride] "r" (6 * matrix_stride * sizeof(float)), [in_col_stride1] "r" (matrix_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+ }
+ else
+ {
+ __asm__ __volatile__ (
+ "ldr d0, [%[pcoeffs]]\n"
+ "add x21, %[in_col_stride1], %[in_col_stride1]\n" // Compute input column stride 2
+ "add x22, x21, %[in_col_stride1]\n" // Compute input column stride 3
+ "add x25, %[inptr0], %[in_row_stride]\n" // Compute input row pointers
+ "add x15, %[output_col_stride1], %[output_col_stride1]\n" // Compute output column stride 2
+ "add x23, x22, %[in_col_stride1]\n" // Compute input column stride 4
+ "add x13, x25, %[in_row_stride]\n" // Compute input row pointers
+ "add x16, x15, %[output_col_stride1]\n" // Compute output column stride 3
+ "add x24, x23, %[in_col_stride1]\n" // Compute input column stride 5
+ "add x26, x13, %[in_row_stride]\n" // Compute input row pointers
+ "add x17, %[outptr0], %[output_row_stride]\n" // Compute output row pointer 1
+ "add x14, x26, %[in_row_stride]\n" // Compute input row pointers
+ "add x28, x17, %[output_row_stride]\n" // Compute output row pointer 2
+ "lsr x19, %[n_channels], #2\n"
+ "add x27, x14, %[in_row_stride]\n" // Compute input row pointers
+ "add x18, x28, %[output_row_stride]\n" // Compute output row pointer 3
+ "and x20, %[n_channels], #3\n"
+ "cbz x19, 4f\n"
+ "1:\n" // Quad head
+ "ldr q17, [%[inptr0]]\n"
+ "subs x19, x19, #1\n"
+ "ldr q23, [%[inptr0], %[in_col_stride1]]\n"
+ "ldr q27, [%[inptr0], x21]\n"
+ "fadd v4.4s, v23.4s, v27.4s\n"
+ "ldr q24, [%[inptr0], x22]\n"
+ "fsub v13.4s, v23.4s, v27.4s\n"
+ "ldr q11, [%[inptr0], x23]\n"
+ "fadd v10.4s, v24.4s, v11.4s\n"
+ "ldr q12, [%[inptr0], x24]\n"
+ "fsub v11.4s, v24.4s, v11.4s\n"
+ "ldr q20, [x25]\n"
+ "fadd v7.4s, v17.4s, v4.4s\n"
+ "ldr q19, [x25, %[in_col_stride1]]\n"
+ "mov v4.16b, v4.16b\n"
+ "ldr q22, [x25, x21]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr q14, [x25, x22]\n"
+ "fmul v11.4s, v11.4s, v0.s[0]\n"
+ "ldr q18, [x25, x23]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "ldr q3, [x25, x24]\n"
+ "fmla v4.4s, v10.4s, v0.s[1]\n"
+ "fadd v8.4s, v13.4s, v11.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v12.4s\n"
+ "beq 3f\n"
+ "2:\n" // Quad loop
+ "fadd v2.4s, v19.4s, v22.4s\n"
+ "ldr q16, [x13]\n"
+ "fadd v23.4s, v14.4s, v18.4s\n"
+ "ldr q21, [x13, %[in_col_stride1]]\n"
+ "fsub v15.4s, v19.4s, v22.4s\n"
+ "ldr q24, [x13, x21]\n"
+ "fsub v31.4s, v14.4s, v18.4s\n"
+ "ldr q25, [x13, x22]\n"
+ "fadd v11.4s, v20.4s, v2.4s\n"
+ "ldr q17, [x13, x23]\n"
+ "mov v13.16b, v2.16b\n"
+ "ldr q9, [x13, x24]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q6, [x26]\n"
+ "fmul v31.4s, v31.4s, v0.s[0]\n"
+ "ldr q19, [x26, %[in_col_stride1]]\n"
+ "fadd v11.4s, v11.4s, v23.4s\n"
+ "ldr q22, [x26, x21]\n"
+ "fmla v13.4s, v23.4s, v0.s[1]\n"
+ "ldr q12, [x26, x22]\n"
+ "fadd v29.4s, v21.4s, v24.4s\n"
+ "ldr q26, [x26, x23]\n"
+ "fadd v15.4s, v15.4s, v31.4s\n"
+ "ldr q5, [x26, x24]\n"
+ "fmla v2.4s, v31.4s, v0.s[1]\n"
+ "ldr q10, [x14]\n"
+ "fadd v18.4s, v25.4s, v17.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fadd v27.4s, v16.4s, v29.4s\n"
+ "add x25, x25, #16\n"
+ "fsub v14.4s, v21.4s, v24.4s\n"
+ "ldr q30, [x14, %[in_col_stride1]]\n"
+ "fadd v2.4s, v2.4s, v3.4s\n"
+ "ldr q31, [x14, x21]\n"
+ "fsub v28.4s, v25.4s, v17.4s\n"
+ "add x13, x13, #16\n"
+ "fadd v27.4s, v27.4s, v18.4s\n"
+ "add x26, x26, #16\n"
+ "mov v21.16b, v29.16b\n"
+ "subs x19, x19, #1\n"
+ "fadd v20.4s, v19.4s, v22.4s\n"
+ "fsub v17.4s, v19.4s, v22.4s\n"
+ "fmul v28.4s, v28.4s, v0.s[0]\n"
+ "ldr q23, [x14, x22]\n"
+ "fmla v21.4s, v18.4s, v0.s[1]\n"
+ "fadd v29.4s, v12.4s, v26.4s\n"
+ "fsub v16.4s, v12.4s, v26.4s\n"
+ "fadd v25.4s, v30.4s, v31.4s\n"
+ "fadd v24.4s, v6.4s, v20.4s\n"
+ "mov v6.16b, v20.16b\n"
+ "fsub v22.4s, v30.4s, v31.4s\n"
+ "fadd v31.4s, v11.4s, v27.4s\n"
+ "fsub v12.4s, v11.4s, v27.4s\n"
+ "ldr q26, [x14, x23]\n"
+ "fmul v16.4s, v16.4s, v0.s[0]\n"
+ "fmla v6.4s, v29.4s, v0.s[1]\n"
+ "fadd v24.4s, v24.4s, v29.4s\n"
+ "mov v3.16b, v14.16b\n"
+ "fadd v20.4s, v14.4s, v28.4s\n"
+ "fadd v29.4s, v10.4s, v25.4s\n"
+ "mov v10.16b, v25.16b\n"
+ "fadd v25.4s, v7.4s, v31.4s\n"
+ "fmla v3.4s, v28.4s, v0.s[1]\n"
+ "fadd v14.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "mov v26.16b, v31.16b\n"
+ "fadd v31.4s, v15.4s, v20.4s\n"
+ "fsub v11.4s, v15.4s, v20.4s\n"
+ "fadd v20.4s, v17.4s, v16.4s\n"
+ "mov v7.16b, v17.16b\n"
+ "fadd v3.4s, v3.4s, v9.4s\n"
+ "ldr q18, [x14, x24]\n"
+ "fadd v29.4s, v29.4s, v14.4s\n"
+ "add x14, x14, #16\n"
+ "fmla v7.4s, v16.4s, v0.s[1]\n"
+ "ldr q19, [x27]\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fmla v10.4s, v14.4s, v0.s[1]\n"
+ "fadd v15.4s, v8.4s, v31.4s\n"
+ "mov v14.16b, v31.16b\n"
+ "fadd v28.4s, v24.4s, v29.4s\n"
+ "fsub v24.4s, v24.4s, v29.4s\n"
+ "fadd v7.4s, v7.4s, v5.4s\n"
+ "ldr q27, [x27, %[in_col_stride1]]\n"
+ "fadd v30.4s, v13.4s, v21.4s\n"
+ "fsub v9.4s, v13.4s, v21.4s\n"
+ "fadd v17.4s, v22.4s, v23.4s\n"
+ "mov v8.16b, v22.16b\n"
+ "fadd v25.4s, v25.4s, v28.4s\n"
+ "fmul v24.4s, v24.4s, v0.s[0]\n"
+ "fmla v26.4s, v28.4s, v0.s[1]\n"
+ "ldr q29, [x27, x21]\n"
+ "fmla v8.4s, v23.4s, v0.s[1]\n"
+ "ldr q28, [x27, x22]\n"
+ "fadd v13.4s, v4.4s, v30.4s\n"
+ "mov v4.16b, v30.16b\n"
+ "str q25, [%[outptr0]]\n" // Store output (0, 0)
+ "fadd v16.4s, v27.4s, v29.4s\n"
+ "str q26, [x28]\n" // Store output (2, 0)
+ "fsub v29.4s, v27.4s, v29.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "ldr q23, [x27, x23]\n"
+ "fadd v30.4s, v28.4s, v23.4s\n"
+ "ldr q25, [x27, x24]\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "add x27, x27, #16\n"
+ "fsub v27.4s, v28.4s, v23.4s\n"
+ "mov v16.16b, v16.16b\n"
+ "fadd v22.4s, v20.4s, v17.4s\n"
+ "fsub v20.4s, v20.4s, v17.4s\n"
+ "fadd v21.4s, v12.4s, v24.4s\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v19.4s, v19.4s, v30.4s\n"
+ "fmla v16.4s, v30.4s, v0.s[1]\n"
+ "fmul v27.4s, v27.4s, v0.s[0]\n"
+ "ldr q17, [%[inptr0]]\n"
+ "fmla v26.4s, v24.4s, v0.s[1]\n"
+ "ldr q23, [%[inptr0], %[in_col_stride1]]\n"
+ "str q21, [x17]\n" // Store output (1, 0)
+ "mov v5.16b, v29.16b\n"
+ "fadd v15.4s, v15.4s, v22.4s\n"
+ "fmul v20.4s, v20.4s, v0.s[0]\n"
+ "fadd v18.4s, v29.4s, v27.4s\n"
+ "fmla v14.4s, v22.4s, v0.s[1]\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "ldr q27, [%[inptr0], x21]\n"
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q24, [%[inptr0], x22]\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n" // Store output (0, 1)
+ "fadd v12.4s, v11.4s, v20.4s\n"
+ "str q14, [x28, %[output_col_stride1]]\n" // Store output (2, 1)
+ "mov v28.16b, v11.16b\n"
+ "fadd v5.4s, v5.4s, v25.4s\n"
+ "ldr q11, [%[inptr0], x23]\n"
+ "str q26, [x18]\n" // Store output (3, 0)
+ "fadd v21.4s, v6.4s, v10.4s\n"
+ "str q12, [x17, %[output_col_stride1]]\n" // Store output (1, 1)
+ "fmla v28.4s, v20.4s, v0.s[1]\n"
+ "fsub v10.4s, v6.4s, v10.4s\n"
+ "ldr q12, [%[inptr0], x24]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q20, [x25]\n"
+ "fadd v13.4s, v13.4s, v21.4s\n"
+ "ldr q19, [x25, %[in_col_stride1]]\n"
+ "fadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q22, [x25, x21]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "ldr q14, [x25, x22]\n"
+ "fmla v4.4s, v21.4s, v0.s[1]\n"
+ "ldr q18, [x25, x23]\n"
+ "str q13, [%[outptr0], x15]\n" // Store output (0, 2)
+ "fadd v6.4s, v2.4s, v3.4s\n"
+ "str q28, [x18, %[output_col_stride1]]\n" // Store output (3, 1)
+ "fadd v30.4s, v7.4s, v8.4s\n"
+ "fadd v13.4s, v9.4s, v10.4s\n"
+ "fmla v15.4s, v10.4s, v0.s[1]\n"
+ "str q4, [x28, x15]\n" // Store output (2, 2)
+ "fsub v2.4s, v2.4s, v3.4s\n"
+ "fadd v1.4s, v1.4s, v6.4s\n"
+ "ldr q3, [x25, x24]\n"
+ "fsub v8.4s, v7.4s, v8.4s\n"
+ "mov v6.16b, v6.16b\n"
+ "str q13, [x17, x15]\n" // Store output (1, 2)
+ "fadd v15.4s, v15.4s, v16.4s\n"
+ "mov v9.16b, v2.16b\n"
+ "fadd v4.4s, v23.4s, v27.4s\n"
+ "fadd v1.4s, v1.4s, v30.4s\n"
+ "fmla v6.4s, v30.4s, v0.s[1]\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fadd v10.4s, v24.4s, v11.4s\n"
+ "str q15, [x18, x15]\n" // Store output (3, 2)
+ "fsub v13.4s, v23.4s, v27.4s\n"
+ "fadd v7.4s, v17.4s, v4.4s\n"
+ "fsub v11.4s, v24.4s, v11.4s\n"
+ "str q1, [%[outptr0], x16]\n" // Store output (0, 3)
+ "mov v4.16b, v4.16b\n"
+ "str q6, [x28, x16]\n" // Store output (2, 3)
+ "fadd v2.4s, v2.4s, v8.4s\n"
+ "fmla v9.4s, v8.4s, v0.s[1]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "add x28, x28, #16\n"
+ "fmul v11.4s, v11.4s, v0.s[0]\n"
+ "fmla v4.4s, v10.4s, v0.s[1]\n"
+ "str q2, [x17, x16]\n" // Store output (1, 3)
+ "mov v1.16b, v13.16b\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "add x17, x17, #16\n"
+ "fadd v8.4s, v13.4s, v11.4s\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "str q9, [x18, x16]\n" // Store output (3, 3)
+ "add x18, x18, #16\n"
+ "fadd v1.4s, v1.4s, v12.4s\n"
+ "bne 2b\n"
+ "3:\n" // Quad tail
+ "fadd v2.4s, v19.4s, v22.4s\n"
+ "ldr q16, [x13]\n"
+ "fadd v23.4s, v14.4s, v18.4s\n"
+ "ldr q21, [x13, %[in_col_stride1]]\n"
+ "fsub v15.4s, v19.4s, v22.4s\n"
+ "ldr q24, [x13, x21]\n"
+ "fsub v31.4s, v14.4s, v18.4s\n"
+ "ldr q25, [x13, x22]\n"
+ "fadd v11.4s, v20.4s, v2.4s\n"
+ "ldr q17, [x13, x23]\n"
+ "mov v13.16b, v2.16b\n"
+ "ldr q9, [x13, x24]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr q6, [x26]\n"
+ "fmul v31.4s, v31.4s, v0.s[0]\n"
+ "ldr q19, [x26, %[in_col_stride1]]\n"
+ "fadd v11.4s, v11.4s, v23.4s\n"
+ "ldr q22, [x26, x21]\n"
+ "fmla v13.4s, v23.4s, v0.s[1]\n"
+ "ldr q12, [x26, x22]\n"
+ "fadd v29.4s, v21.4s, v24.4s\n"
+ "ldr q26, [x26, x23]\n"
+ "fadd v15.4s, v15.4s, v31.4s\n"
+ "ldr q5, [x26, x24]\n"
+ "fmla v2.4s, v31.4s, v0.s[1]\n"
+ "ldr q10, [x14]\n"
+ "fadd v18.4s, v25.4s, v17.4s\n"
+ "add %[inptr0], %[inptr0], #16\n"
+ "fadd v27.4s, v16.4s, v29.4s\n"
+ "add x25, x25, #16\n"
+ "fsub v14.4s, v21.4s, v24.4s\n"
+ "ldr q30, [x14, %[in_col_stride1]]\n"
+ "fadd v2.4s, v2.4s, v3.4s\n"
+ "ldr q31, [x14, x21]\n"
+ "fsub v28.4s, v25.4s, v17.4s\n"
+ "add x13, x13, #16\n"
+ "fadd v27.4s, v27.4s, v18.4s\n"
+ "add x26, x26, #16\n"
+ "mov v21.16b, v29.16b\n"
+ "fadd v20.4s, v19.4s, v22.4s\n"
+ "fsub v17.4s, v19.4s, v22.4s\n"
+ "fadd v29.4s, v12.4s, v26.4s\n"
+ "fmul v28.4s, v28.4s, v0.s[0]\n"
+ "fsub v16.4s, v12.4s, v26.4s\n"
+ "fmla v21.4s, v18.4s, v0.s[1]\n"
+ "ldr q23, [x14, x22]\n"
+ "fadd v24.4s, v6.4s, v20.4s\n"
+ "mov v6.16b, v20.16b\n"
+ "fadd v25.4s, v30.4s, v31.4s\n"
+ "fsub v22.4s, v30.4s, v31.4s\n"
+ "fadd v20.4s, v14.4s, v28.4s\n"
+ "mov v3.16b, v14.16b\n"
+ "fmul v16.4s, v16.4s, v0.s[0]\n"
+ "fmla v6.4s, v29.4s, v0.s[1]\n"
+ "fadd v24.4s, v24.4s, v29.4s\n"
+ "ldr q26, [x14, x23]\n"
+ "fmla v3.4s, v28.4s, v0.s[1]\n"
+ "fadd v14.4s, v23.4s, v26.4s\n"
+ "fadd v29.4s, v10.4s, v25.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "mov v10.16b, v25.16b\n"
+ "fadd v31.4s, v11.4s, v27.4s\n"
+ "fsub v12.4s, v11.4s, v27.4s\n"
+ "ldr q18, [x14, x24]\n"
+ "fadd v3.4s, v3.4s, v9.4s\n"
+ "ldr q19, [x27]\n"
+ "fadd v29.4s, v29.4s, v14.4s\n"
+ "add x14, x14, #16\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fmla v10.4s, v14.4s, v0.s[1]\n"
+ "fadd v25.4s, v7.4s, v31.4s\n"
+ "mov v26.16b, v31.16b\n"
+ "fadd v31.4s, v15.4s, v20.4s\n"
+ "fsub v11.4s, v15.4s, v20.4s\n"
+ "fadd v28.4s, v24.4s, v29.4s\n"
+ "fsub v24.4s, v24.4s, v29.4s\n"
+ "fadd v30.4s, v13.4s, v21.4s\n"
+ "fsub v9.4s, v13.4s, v21.4s\n"
+ "fadd v20.4s, v17.4s, v16.4s\n"
+ "mov v7.16b, v17.16b\n"
+ "fadd v15.4s, v8.4s, v31.4s\n"
+ "mov v14.16b, v31.16b\n"
+ "fadd v25.4s, v25.4s, v28.4s\n"
+ "fmul v24.4s, v24.4s, v0.s[0]\n"
+ "fmla v7.4s, v16.4s, v0.s[1]\n"
+ "ldr q27, [x27, %[in_col_stride1]]\n"
+ "fmla v26.4s, v28.4s, v0.s[1]\n"
+ "ldr q29, [x27, x21]\n"
+ "fadd v13.4s, v4.4s, v30.4s\n"
+ "mov v4.16b, v30.16b\n"
+ "str q25, [%[outptr0]]\n" // Store output (0, 0)
+ "fadd v17.4s, v22.4s, v23.4s\n"
+ "fadd v7.4s, v7.4s, v5.4s\n"
+ "ldr q28, [x27, x22]\n"
+ "str q26, [x28]\n" // Store output (2, 0)
+ "mov v8.16b, v22.16b\n"
+ "fadd v16.4s, v27.4s, v29.4s\n"
+ "fsub v29.4s, v27.4s, v29.4s\n"
+ "fadd v21.4s, v12.4s, v24.4s\n"
+ "mov v26.16b, v12.16b\n"
+ "fmla v8.4s, v23.4s, v0.s[1]\n"
+ "fadd v22.4s, v20.4s, v17.4s\n"
+ "fsub v20.4s, v20.4s, v17.4s\n"
+ "ldr q23, [x27, x23]\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "mov v16.16b, v16.16b\n"
+ "str q21, [x17]\n" // Store output (1, 0)
+ "fadd v30.4s, v28.4s, v23.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "ldr q25, [x27, x24]\n"
+ "fsub v27.4s, v28.4s, v23.4s\n"
+ "add x27, x27, #16\n"
+ "mov v5.16b, v29.16b\n"
+ "fmla v26.4s, v24.4s, v0.s[1]\n"
+ "fadd v19.4s, v19.4s, v30.4s\n"
+ "fmla v16.4s, v30.4s, v0.s[1]\n"
+ "fadd v15.4s, v15.4s, v22.4s\n"
+ "fmul v20.4s, v20.4s, v0.s[0]\n"
+ "fmul v27.4s, v27.4s, v0.s[0]\n"
+ "fmla v14.4s, v22.4s, v0.s[1]\n"
+ "mov v28.16b, v11.16b\n"
+ "fadd v21.4s, v6.4s, v10.4s\n"
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "fsub v10.4s, v6.4s, v10.4s\n"
+ "str q15, [%[outptr0], %[output_col_stride1]]\n" // Store output (0, 1)
+ "fadd v12.4s, v11.4s, v20.4s\n"
+ "str q14, [x28, %[output_col_stride1]]\n" // Store output (2, 1)
+ "fadd v18.4s, v29.4s, v27.4s\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "fmla v28.4s, v20.4s, v0.s[1]\n"
+ "str q26, [x18]\n" // Store output (3, 0)
+ "fadd v13.4s, v13.4s, v21.4s\n"
+ "str q12, [x17, %[output_col_stride1]]\n" // Store output (1, 1)
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "fmla v4.4s, v21.4s, v0.s[1]\n"
+ "mov v15.16b, v9.16b\n"
+ "fadd v5.4s, v5.4s, v25.4s\n"
+ "fadd v28.4s, v28.4s, v18.4s\n"
+ "str q13, [%[outptr0], x15]\n" // Store output (0, 2)
+ "fadd v6.4s, v2.4s, v3.4s\n"
+ "fadd v13.4s, v9.4s, v10.4s\n"
+ "fmla v15.4s, v10.4s, v0.s[1]\n"
+ "str q4, [x28, x15]\n" // Store output (2, 2)
+ "fadd v30.4s, v7.4s, v8.4s\n"
+ "str q28, [x18, %[output_col_stride1]]\n" // Store output (3, 1)
+ "fsub v2.4s, v2.4s, v3.4s\n"
+ "fadd v1.4s, v1.4s, v6.4s\n"
+ "fsub v8.4s, v7.4s, v8.4s\n"
+ "str q13, [x17, x15]\n" // Store output (1, 2)
+ "fadd v15.4s, v15.4s, v16.4s\n"
+ "mov v6.16b, v6.16b\n"
+ "mov v9.16b, v2.16b\n"
+ "fadd v1.4s, v1.4s, v30.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "str q15, [x18, x15]\n" // Store output (3, 2)
+ "fmla v6.4s, v30.4s, v0.s[1]\n"
+ "str q1, [%[outptr0], x16]\n" // Store output (0, 3)
+ "fadd v2.4s, v2.4s, v8.4s\n"
+ "str q6, [x28, x16]\n" // Store output (2, 3)
+ "fmla v9.4s, v8.4s, v0.s[1]\n"
+ "add %[outptr0], %[outptr0], #16\n"
+ "add x28, x28, #16\n"
+ "str q2, [x17, x16]\n" // Store output (1, 3)
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "add x17, x17, #16\n"
+ "str q9, [x18, x16]\n" // Store output (3, 3)
+ "add x18, x18, #16\n"
+ "4:\n" // Double
+ "cmp x20, #2\n"
+ "blt 5f\n"
+ "ldr d17, [%[inptr0]]\n"
+ "ldr d23, [%[inptr0], %[in_col_stride1]]\n"
+ "sub x20, x20, #2\n"
+ "ldr d27, [%[inptr0], x21]\n"
+ "ldr d24, [%[inptr0], x22]\n"
+ "fadd v4.4s, v23.4s, v27.4s\n"
+ "ldr d11, [%[inptr0], x23]\n"
+ "fadd v10.4s, v24.4s, v11.4s\n"
+ "ldr d12, [%[inptr0], x24]\n"
+ "fsub v13.4s, v23.4s, v27.4s\n"
+ "ldr d20, [x25]\n"
+ "fsub v11.4s, v24.4s, v11.4s\n"
+ "ldr d19, [x25, %[in_col_stride1]]\n"
+ "fadd v7.4s, v17.4s, v4.4s\n"
+ "ldr d22, [x25, x21]\n"
+ "mov v4.16b, v4.16b\n"
+ "ldr d14, [x25, x22]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr d18, [x25, x23]\n"
+ "fmul v11.4s, v11.4s, v0.s[0]\n"
+ "ldr d3, [x25, x24]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "ldr d16, [x13]\n"
+ "fmla v4.4s, v10.4s, v0.s[1]\n"
+ "ldr d21, [x13, %[in_col_stride1]]\n"
+ "fadd v2.4s, v19.4s, v22.4s\n"
+ "ldr d24, [x13, x21]\n"
+ "fadd v8.4s, v13.4s, v11.4s\n"
+ "ldr d25, [x13, x22]\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "ldr d17, [x13, x23]\n"
+ "fadd v23.4s, v14.4s, v18.4s\n"
+ "ldr d9, [x13, x24]\n"
+ "fadd v11.4s, v20.4s, v2.4s\n"
+ "ldr d6, [x26]\n"
+ "fsub v15.4s, v19.4s, v22.4s\n"
+ "ldr d19, [x26, %[in_col_stride1]]\n"
+ "fadd v1.4s, v1.4s, v12.4s\n"
+ "ldr d22, [x26, x21]\n"
+ "fsub v31.4s, v14.4s, v18.4s\n"
+ "ldr d12, [x26, x22]\n"
+ "fadd v11.4s, v11.4s, v23.4s\n"
+ "ldr d26, [x26, x23]\n"
+ "mov v13.16b, v2.16b\n"
+ "ldr d5, [x26, x24]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr d10, [x14]\n"
+ "fmul v31.4s, v31.4s, v0.s[0]\n"
+ "add %[inptr0], %[inptr0], #8\n"
+ "fmla v13.4s, v23.4s, v0.s[1]\n"
+ "add x25, x25, #8\n"
+ "fadd v29.4s, v21.4s, v24.4s\n"
+ "add x13, x13, #8\n"
+ "fsub v14.4s, v21.4s, v24.4s\n"
+ "ldr d30, [x14, %[in_col_stride1]]\n"
+ "fadd v15.4s, v15.4s, v31.4s\n"
+ "add x26, x26, #8\n"
+ "fmla v2.4s, v31.4s, v0.s[1]\n"
+ "fadd v18.4s, v25.4s, v17.4s\n"
+ "fadd v27.4s, v16.4s, v29.4s\n"
+ "fsub v28.4s, v25.4s, v17.4s\n"
+ "mov v21.16b, v29.16b\n"
+ "fadd v20.4s, v19.4s, v22.4s\n"
+ "fsub v17.4s, v19.4s, v22.4s\n"
+ "ldr d31, [x14, x21]\n"
+ "fadd v2.4s, v2.4s, v3.4s\n"
+ "ldr d23, [x14, x22]\n"
+ "fadd v27.4s, v27.4s, v18.4s\n"
+ "fmul v28.4s, v28.4s, v0.s[0]\n"
+ "fmla v21.4s, v18.4s, v0.s[1]\n"
+ "fadd v29.4s, v12.4s, v26.4s\n"
+ "fadd v24.4s, v6.4s, v20.4s\n"
+ "fsub v16.4s, v12.4s, v26.4s\n"
+ "mov v6.16b, v20.16b\n"
+ "fadd v25.4s, v30.4s, v31.4s\n"
+ "fsub v22.4s, v30.4s, v31.4s\n"
+ "fadd v31.4s, v11.4s, v27.4s\n"
+ "fsub v12.4s, v11.4s, v27.4s\n"
+ "ldr d26, [x14, x23]\n"
+ "fadd v24.4s, v24.4s, v29.4s\n"
+ "fmul v16.4s, v16.4s, v0.s[0]\n"
+ "fmla v6.4s, v29.4s, v0.s[1]\n"
+ "mov v3.16b, v14.16b\n"
+ "fadd v20.4s, v14.4s, v28.4s\n"
+ "fadd v29.4s, v10.4s, v25.4s\n"
+ "mov v10.16b, v25.16b\n"
+ "fadd v25.4s, v7.4s, v31.4s\n"
+ "fmla v3.4s, v28.4s, v0.s[1]\n"
+ "fadd v14.4s, v23.4s, v26.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "mov v26.16b, v31.16b\n"
+ "fadd v31.4s, v15.4s, v20.4s\n"
+ "fsub v11.4s, v15.4s, v20.4s\n"
+ "fadd v20.4s, v17.4s, v16.4s\n"
+ "mov v7.16b, v17.16b\n"
+ "fadd v3.4s, v3.4s, v9.4s\n"
+ "ldr d18, [x14, x24]\n"
+ "fadd v29.4s, v29.4s, v14.4s\n"
+ "add x14, x14, #8\n"
+ "fmla v7.4s, v16.4s, v0.s[1]\n"
+ "ldr d19, [x27]\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fmla v10.4s, v14.4s, v0.s[1]\n"
+ "fadd v15.4s, v8.4s, v31.4s\n"
+ "mov v14.16b, v31.16b\n"
+ "fadd v28.4s, v24.4s, v29.4s\n"
+ "fsub v24.4s, v24.4s, v29.4s\n"
+ "fadd v7.4s, v7.4s, v5.4s\n"
+ "ldr d27, [x27, %[in_col_stride1]]\n"
+ "fadd v30.4s, v13.4s, v21.4s\n"
+ "fsub v9.4s, v13.4s, v21.4s\n"
+ "fadd v17.4s, v22.4s, v23.4s\n"
+ "mov v8.16b, v22.16b\n"
+ "fadd v25.4s, v25.4s, v28.4s\n"
+ "fmul v24.4s, v24.4s, v0.s[0]\n"
+ "fmla v26.4s, v28.4s, v0.s[1]\n"
+ "ldr d29, [x27, x21]\n"
+ "fmla v8.4s, v23.4s, v0.s[1]\n"
+ "ldr d28, [x27, x22]\n"
+ "fadd v13.4s, v4.4s, v30.4s\n"
+ "mov v4.16b, v30.16b\n"
+ "str d25, [%[outptr0]]\n" // Store output (0, 0)
+ "fadd v16.4s, v27.4s, v29.4s\n"
+ "str d26, [x28]\n" // Store output (2, 0)
+ "fsub v29.4s, v27.4s, v29.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "ldr d23, [x27, x23]\n"
+ "fadd v30.4s, v28.4s, v23.4s\n"
+ "ldr d25, [x27, x24]\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "add x27, x27, #8\n"
+ "fsub v27.4s, v28.4s, v23.4s\n"
+ "mov v16.16b, v16.16b\n"
+ "fadd v22.4s, v20.4s, v17.4s\n"
+ "fsub v20.4s, v20.4s, v17.4s\n"
+ "fadd v21.4s, v12.4s, v24.4s\n"
+ "mov v26.16b, v12.16b\n"
+ "fadd v19.4s, v19.4s, v30.4s\n"
+ "fmla v16.4s, v30.4s, v0.s[1]\n"
+ "fmul v27.4s, v27.4s, v0.s[0]\n"
+ "mov v5.16b, v29.16b\n"
+ "fmla v26.4s, v24.4s, v0.s[1]\n"
+ "fadd v15.4s, v15.4s, v22.4s\n"
+ "str d21, [x17]\n" // Store output (1, 0)
+ "fmul v20.4s, v20.4s, v0.s[0]\n"
+ "fmla v14.4s, v22.4s, v0.s[1]\n"
+ "mov v28.16b, v11.16b\n"
+ "fadd v18.4s, v29.4s, v27.4s\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "str d15, [%[outptr0], %[output_col_stride1]]\n" // Store output (0, 1)
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "fadd v12.4s, v11.4s, v20.4s\n"
+ "fmla v28.4s, v20.4s, v0.s[1]\n"
+ "str d14, [x28, %[output_col_stride1]]\n" // Store output (2, 1)
+ "fadd v21.4s, v6.4s, v10.4s\n"
+ "fadd v5.4s, v5.4s, v25.4s\n"
+ "fsub v10.4s, v6.4s, v10.4s\n"
+ "str d26, [x18]\n" // Store output (3, 0)
+ "mov v15.16b, v9.16b\n"
+ "str d12, [x17, %[output_col_stride1]]\n" // Store output (1, 1)
+ "fadd v28.4s, v28.4s, v18.4s\n"
+ "fadd v13.4s, v13.4s, v21.4s\n"
+ "fmla v4.4s, v21.4s, v0.s[1]\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "fadd v6.4s, v2.4s, v3.4s\n"
+ "fadd v30.4s, v7.4s, v8.4s\n"
+ "fsub v2.4s, v2.4s, v3.4s\n"
+ "str d28, [x18, %[output_col_stride1]]\n" // Store output (3, 1)
+ "fsub v8.4s, v7.4s, v8.4s\n"
+ "str d13, [%[outptr0], x15]\n" // Store output (0, 2)
+ "str d4, [x28, x15]\n" // Store output (2, 2)
+ "fadd v13.4s, v9.4s, v10.4s\n"
+ "fmla v15.4s, v10.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v6.4s\n"
+ "mov v6.16b, v6.16b\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "mov v9.16b, v2.16b\n"
+ "str d13, [x17, x15]\n" // Store output (1, 2)
+ "fadd v15.4s, v15.4s, v16.4s\n"
+ "fadd v1.4s, v1.4s, v30.4s\n"
+ "fmla v6.4s, v30.4s, v0.s[1]\n"
+ "fadd v2.4s, v2.4s, v8.4s\n"
+ "fmla v9.4s, v8.4s, v0.s[1]\n"
+ "str d15, [x18, x15]\n" // Store output (3, 2)
+ "str d1, [%[outptr0], x16]\n" // Store output (0, 3)
+ "str d2, [x17, x16]\n" // Store output (1, 3)
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "str d6, [x28, x16]\n" // Store output (2, 3)
+ "add %[outptr0], %[outptr0], #8\n"
+ "add x17, x17, #8\n"
+ "add x28, x28, #8\n"
+ "str d9, [x18, x16]\n" // Store output (3, 3)
+ "add x18, x18, #8\n"
+ "5:\n" // Scalar
+ "cbz x20, 6f\n"
+ "ldr s17, [%[inptr0]]\n"
+ "ldr s23, [%[inptr0], %[in_col_stride1]]\n"
+ "ldr s27, [%[inptr0], x21]\n"
+ "fadd v4.4s, v23.4s, v27.4s\n"
+ "ldr s24, [%[inptr0], x22]\n"
+ "fsub v13.4s, v23.4s, v27.4s\n"
+ "ldr s11, [%[inptr0], x23]\n"
+ "fadd v10.4s, v24.4s, v11.4s\n"
+ "ldr s12, [%[inptr0], x24]\n"
+ "fsub v11.4s, v24.4s, v11.4s\n"
+ "ldr s20, [x25]\n"
+ "fadd v7.4s, v17.4s, v4.4s\n"
+ "ldr s19, [x25, %[in_col_stride1]]\n"
+ "mov v4.16b, v4.16b\n"
+ "ldr s22, [x25, x21]\n"
+ "mov v1.16b, v13.16b\n"
+ "ldr s14, [x25, x22]\n"
+ "fmul v11.4s, v11.4s, v0.s[0]\n"
+ "ldr s18, [x25, x23]\n"
+ "fadd v7.4s, v7.4s, v10.4s\n"
+ "ldr s3, [x25, x24]\n"
+ "fmla v4.4s, v10.4s, v0.s[1]\n"
+ "ldr s16, [x13]\n"
+ "fadd v2.4s, v19.4s, v22.4s\n"
+ "ldr s21, [x13, %[in_col_stride1]]\n"
+ "fadd v8.4s, v13.4s, v11.4s\n"
+ "ldr s24, [x13, x21]\n"
+ "fmla v1.4s, v11.4s, v0.s[1]\n"
+ "ldr s25, [x13, x22]\n"
+ "fadd v23.4s, v14.4s, v18.4s\n"
+ "ldr s17, [x13, x23]\n"
+ "fadd v11.4s, v20.4s, v2.4s\n"
+ "ldr s9, [x13, x24]\n"
+ "fsub v15.4s, v19.4s, v22.4s\n"
+ "ldr s6, [x26]\n"
+ "fadd v1.4s, v1.4s, v12.4s\n"
+ "ldr s19, [x26, %[in_col_stride1]]\n"
+ "fsub v31.4s, v14.4s, v18.4s\n"
+ "ldr s22, [x26, x21]\n"
+ "fadd v11.4s, v11.4s, v23.4s\n"
+ "ldr s12, [x26, x22]\n"
+ "mov v13.16b, v2.16b\n"
+ "ldr s26, [x26, x23]\n"
+ "mov v2.16b, v15.16b\n"
+ "ldr s5, [x26, x24]\n"
+ "fmul v31.4s, v31.4s, v0.s[0]\n"
+ "ldr s10, [x14]\n"
+ "fmla v13.4s, v23.4s, v0.s[1]\n"
+ "fadd v29.4s, v21.4s, v24.4s\n"
+ "fsub v14.4s, v21.4s, v24.4s\n"
+ "fadd v18.4s, v25.4s, v17.4s\n"
+ "fsub v28.4s, v25.4s, v17.4s\n"
+ "ldr s30, [x14, %[in_col_stride1]]\n"
+ "fadd v15.4s, v15.4s, v31.4s\n"
+ "fmla v2.4s, v31.4s, v0.s[1]\n"
+ "fadd v27.4s, v16.4s, v29.4s\n"
+ "mov v21.16b, v29.16b\n"
+ "fadd v20.4s, v19.4s, v22.4s\n"
+ "fsub v17.4s, v19.4s, v22.4s\n"
+ "fmul v28.4s, v28.4s, v0.s[0]\n"
+ "ldr s31, [x14, x21]\n"
+ "fadd v2.4s, v2.4s, v3.4s\n"
+ "ldr s23, [x14, x22]\n"
+ "fadd v27.4s, v27.4s, v18.4s\n"
+ "fmla v21.4s, v18.4s, v0.s[1]\n"
+ "fadd v29.4s, v12.4s, v26.4s\n"
+ "fadd v24.4s, v6.4s, v20.4s\n"
+ "fsub v16.4s, v12.4s, v26.4s\n"
+ "mov v6.16b, v20.16b\n"
+ "fadd v25.4s, v30.4s, v31.4s\n"
+ "fsub v22.4s, v30.4s, v31.4s\n"
+ "fadd v20.4s, v14.4s, v28.4s\n"
+ "mov v3.16b, v14.16b\n"
+ "fadd v24.4s, v24.4s, v29.4s\n"
+ "fmla v6.4s, v29.4s, v0.s[1]\n"
+ "fmul v16.4s, v16.4s, v0.s[0]\n"
+ "ldr s26, [x14, x23]\n"
+ "fmla v3.4s, v28.4s, v0.s[1]\n"
+ "fadd v14.4s, v23.4s, v26.4s\n"
+ "fadd v29.4s, v10.4s, v25.4s\n"
+ "fsub v23.4s, v23.4s, v26.4s\n"
+ "mov v10.16b, v25.16b\n"
+ "fadd v31.4s, v11.4s, v27.4s\n"
+ "fsub v12.4s, v11.4s, v27.4s\n"
+ "ldr s18, [x14, x24]\n"
+ "fadd v3.4s, v3.4s, v9.4s\n"
+ "ldr s19, [x27]\n"
+ "fadd v29.4s, v29.4s, v14.4s\n"
+ "fmul v23.4s, v23.4s, v0.s[0]\n"
+ "fmla v10.4s, v14.4s, v0.s[1]\n"
+ "fadd v25.4s, v7.4s, v31.4s\n"
+ "mov v26.16b, v31.16b\n"
+ "fadd v31.4s, v15.4s, v20.4s\n"
+ "fsub v11.4s, v15.4s, v20.4s\n"
+ "fadd v30.4s, v13.4s, v21.4s\n"
+ "fsub v9.4s, v13.4s, v21.4s\n"
+ "fadd v28.4s, v24.4s, v29.4s\n"
+ "fsub v24.4s, v24.4s, v29.4s\n"
+ "ldr s27, [x27, %[in_col_stride1]]\n"
+ "fadd v15.4s, v8.4s, v31.4s\n"
+ "mov v14.16b, v31.16b\n"
+ "fadd v13.4s, v4.4s, v30.4s\n"
+ "mov v4.16b, v30.16b\n"
+ "fadd v25.4s, v25.4s, v28.4s\n"
+ "fmla v26.4s, v28.4s, v0.s[1]\n"
+ "fmul v24.4s, v24.4s, v0.s[0]\n"
+ "fadd v21.4s, v6.4s, v10.4s\n"
+ "fsub v10.4s, v6.4s, v10.4s\n"
+ "fadd v6.4s, v2.4s, v3.4s\n"
+ "fsub v2.4s, v2.4s, v3.4s\n"
+ "ldr s29, [x27, x21]\n"
+ "str s25, [%[outptr0]]\n" // Store output (0, 0)
+ "fadd v20.4s, v17.4s, v16.4s\n"
+ "str s26, [x28]\n" // Store output (2, 0)
+ "mov v7.16b, v17.16b\n"
+ "fadd v17.4s, v22.4s, v23.4s\n"
+ "mov v8.16b, v22.16b\n"
+ "fadd v13.4s, v13.4s, v21.4s\n"
+ "fmul v10.4s, v10.4s, v0.s[0]\n"
+ "fmla v7.4s, v16.4s, v0.s[1]\n"
+ "ldr s28, [x27, x22]\n"
+ "fmla v8.4s, v23.4s, v0.s[1]\n"
+ "ldr s23, [x27, x23]\n"
+ "fmla v4.4s, v21.4s, v0.s[1]\n"
+ "ldr s25, [x27, x24]\n"
+ "str s13, [%[outptr0], x15]\n" // Store output (0, 2)
+ "fadd v16.4s, v27.4s, v29.4s\n"
+ "fadd v7.4s, v7.4s, v5.4s\n"
+ "fadd v30.4s, v28.4s, v23.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "fsub v29.4s, v27.4s, v29.4s\n"
+ "str s4, [x28, x15]\n" // Store output (2, 2)
+ "fsub v27.4s, v28.4s, v23.4s\n"
+ "fadd v19.4s, v19.4s, v16.4s\n"
+ "mov v16.16b, v16.16b\n"
+ "fadd v21.4s, v12.4s, v24.4s\n"
+ "mov v26.16b, v12.16b\n"
+ "mov v5.16b, v29.16b\n"
+ "fadd v22.4s, v20.4s, v17.4s\n"
+ "fmul v27.4s, v27.4s, v0.s[0]\n"
+ "fmla v16.4s, v30.4s, v0.s[1]\n"
+ "fadd v19.4s, v19.4s, v30.4s\n"
+ "fmla v26.4s, v24.4s, v0.s[1]\n"
+ "str s21, [x17]\n" // Store output (1, 0)
+ "fsub v20.4s, v20.4s, v17.4s\n"
+ "fadd v15.4s, v15.4s, v22.4s\n"
+ "fmla v14.4s, v22.4s, v0.s[1]\n"
+ "fadd v18.4s, v29.4s, v27.4s\n"
+ "fmla v5.4s, v27.4s, v0.s[1]\n"
+ "fadd v26.4s, v26.4s, v19.4s\n"
+ "mov v28.16b, v11.16b\n"
+ "fmul v20.4s, v20.4s, v0.s[0]\n"
+ "fadd v13.4s, v9.4s, v10.4s\n"
+ "str s15, [%[outptr0], %[output_col_stride1]]\n" // Store output (0, 1)
+ "mov v15.16b, v9.16b\n"
+ "str s14, [x28, %[output_col_stride1]]\n" // Store output (2, 1)
+ "fadd v5.4s, v5.4s, v25.4s\n"
+ "str s26, [x18]\n" // Store output (3, 0)
+ "fadd v30.4s, v7.4s, v8.4s\n"
+ "str s13, [x17, x15]\n" // Store output (1, 2)
+ "fadd v12.4s, v11.4s, v20.4s\n"
+ "fmla v28.4s, v20.4s, v0.s[1]\n"
+ "fmla v15.4s, v10.4s, v0.s[1]\n"
+ "fadd v1.4s, v1.4s, v6.4s\n"
+ "fsub v8.4s, v7.4s, v8.4s\n"
+ "mov v6.16b, v6.16b\n"
+ "mov v9.16b, v2.16b\n"
+ "str s12, [x17, %[output_col_stride1]]\n" // Store output (1, 1)
+ "fadd v28.4s, v28.4s, v18.4s\n"
+ "fadd v15.4s, v15.4s, v16.4s\n"
+ "fadd v1.4s, v1.4s, v30.4s\n"
+ "fmul v8.4s, v8.4s, v0.s[0]\n"
+ "fmla v6.4s, v30.4s, v0.s[1]\n"
+ "str s28, [x18, %[output_col_stride1]]\n" // Store output (3, 1)
+ "str s1, [%[outptr0], x16]\n" // Store output (0, 3)
+ "str s6, [x28, x16]\n" // Store output (2, 3)
+ "fadd v2.4s, v2.4s, v8.4s\n"
+ "str s15, [x18, x15]\n" // Store output (3, 2)
+ "fmla v9.4s, v8.4s, v0.s[1]\n"
+ "str s2, [x17, x16]\n" // Store output (1, 3)
+ "fadd v9.4s, v9.4s, v5.4s\n"
+ "str s9, [x18, x16]\n" // Store output (3, 3)
+ "6:\n" // End
+ : [outptr0] "+r" (output), [inptr0] "+r" (inptr)
+ : [output_col_stride1] "r" (output_col_stride * sizeof(float)), [pcoeffs] "r" (coeffs), [n_channels] "r" ((long) n_channels), [in_row_stride] "r" (6 * matrix_stride * sizeof(float)), [in_col_stride1] "r" (matrix_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float))
+ : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
+ );
+ }
+}
+
+#else
+
+template <>
+void winograd::OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>::transform_tile(
+ const int n_channels,
+ const float* inptr,
+ const int matrix_stride,
+ const float* bptr,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ // Construct a map to the output cells
+ float *outptrs[output_tile_rows][output_tile_cols];
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __arm__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ else
+ {
+ b = vdup_n_f32(0.0f);
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ if (bptr != nullptr)
+ {
+ b = *(bptr++);
+ }
+ else
+ {
+ b = 0.0f;
+ }
+ for (int i = 0; i < output_tile_rows; i++)
+ {
+ for (int j = 0; j < output_tile_cols; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
+ }
+ }
+}
+
+#endif
+
+template class OutputTransform<3, 3, 6, 6, float, float, winograd::WinogradRoots::Integers>;
+
+} // namespace winograd
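
Editorial sketch (not part of the patch): the unrolled FZ/f expressions in the scalar tail above apply one fixed 4x6 coefficient matrix to the 6x6 Winograd-domain tile, once on each side. The matrix name kAT and the function name output_transform_reference below are illustrative only, and bias handling is omitted.

// Reference (unoptimised) form of the per-channel 4x4-from-6x6 output transform:
// FZ = F * kAT^T, then f = kAT * FZ. Coefficients are read off the scalar code above.
static const float kAT[4][6] = {
    { 1.f, 1.f,  1.f, 1.f,  1.f, 0.f },
    { 0.f, 1.f, -1.f, 2.f, -2.f, 0.f },
    { 0.f, 1.f,  1.f, 4.f,  4.f, 0.f },
    { 0.f, 1.f, -1.f, 8.f, -8.f, 1.f },
};

inline void output_transform_reference(const float F[6][6], float f[4][4])
{
    float FZ[6][4];
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            FZ[i][j] = 0.f;
            for (int k = 0; k < 6; k++) { FZ[i][j] += F[i][k] * kAT[j][k]; }
        }
    }
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            f[i][j] = 0.f;
            for (int k = 0; k < 6; k++) { f[i][j] += kAT[i][k] * FZ[k][j]; }
        }
    }
}
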
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
similarity index 74%
rename from src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
index 58bed71..ce921ce 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,42 +22,29 @@
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "output.hpp"
+#include "arm.hpp"
-namespace
+namespace winograd
{
-template <bool Specialized, int PadRight=0>
-void winograd_output_transform_6_3_fp32_process_tile(
+template <>
+void OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::transform_tile(
const int n_channels,
- const float* const matrix_base,
+ const float* inptr,
const int matrix_stride,
- const float* const biases,
+ const float* bptr,
float* const output,
- const int output_row_stride,
- const int output_col_stride,
- const int _pad_bottom,
- const int _pad_right
+ const int, // No need to stride across rows
+ const int output_col_stride
)
{
- (void) output_row_stride;
- (void) _pad_bottom;
- constexpr int output_tile_cols = 6;
- constexpr int inner_tile_cols = 8;
-
- const int pad_right = Specialized ? PadRight : _pad_right;
- const int cells_j = output_tile_cols - pad_right;
-
// Construct a map to the output cells
- float *outptrs[cells_j];
- for (int j = 0; j < cells_j; j++)
+ float *outptrs[output_tile_cols];
+ for (int j = 0; j < output_tile_cols; j++)
{
outptrs[j] = output + j*output_col_stride;
}
- const float *inptr = matrix_base;
- const float *bptr = biases;
// For each channel of the output
int channels_remaining = n_channels;
@@ -87,7 +74,7 @@
b = vld1q_f32(bptr);
bptr += 4;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1q_f32(outptrs[j], f[j] + b);
outptrs[j] += 4;
@@ -118,7 +105,7 @@
b = vld1_f32(bptr);
bptr += 2;
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
vst1_f32(outptrs[j], f[j] + b);
outptrs[j] += 2;
@@ -149,31 +136,14 @@
{
b = *(bptr++);
}
- for (int j = 0; j < cells_j; j++)
+ for (int j = 0; j < output_tile_cols; j++)
{
*(outptrs[j]++) = f[j] + b;
}
}
}
-} // namespace (anonymous)
+template class OutputTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
+template class OutputTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
-namespace winograd
-{
-using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>;
-
-template <>
-const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
- winograd_output_transform_6_3_fp32_process_tile<true, 1>,
- winograd_output_transform_6_3_fp32_process_tile<true, 2>,
- winograd_output_transform_6_3_fp32_process_tile<true, 3>,
- winograd_output_transform_6_3_fp32_process_tile<true, 4>,
- winograd_output_transform_6_3_fp32_process_tile<true, 5>,
-};
-
-template class OutputTransform<1, 3, 1, 8, float>;
-template class OutputTransform<3, 1, 8, 1, float>;
-} // namespace winograd
+} // namespace
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..37ae43f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[kernel_cols];
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[kernel_cols], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
+ V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
+ V[7] = (w[6]*1) / 1.0f;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<1, 7, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<7, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
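
Editorial note: the pointer arithmetic above (outptr = output + ic * matrix_row_stride, advanced by one per output channel, with each transformed value written j * matrix_stride apart) fixes where every coefficient lands in the transformed-weights buffer. A minimal sketch of the implied offset; the helper name transformed_weight_offset is illustrative only.

inline int transformed_weight_offset(int j, int ic, int oc,
                                     int matrix_stride, int matrix_row_stride)
{
    // j selects one of the inner_tile_cols transform matrices, ic the row within a
    // matrix, oc the column; mirrors the loops in WeightTransform<1, 7, 1, 8>::execute.
    return j * matrix_stride + ic * matrix_row_stride + oc;
}
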
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..8fab6db
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input,
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ constexpr int inner_tile_i = 4;
+ constexpr int inner_tile_j = 4;
+
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const auto weight_row_stride = 3 * weight_col_stride;
+ const float *inptrs[3][3];
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+ }
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptrs[i][j]);
+ inptrs[i][j] += 4;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptrs[i][j]);
+ inptrs[i][j] += 2;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptrs[i][j]++);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+ Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+ V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<3, 3, 4, 4, float, float, WinogradRoots::Integers>;
+
+} // namespace
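
Editorial sketch: the unrolled Ww/V expressions above correspond to the classic F(2x2, 3x3) weight transform V = G w G^T. A scalar reference form follows, with kG and weight_transform_2x2_3x3_reference as illustrative names only; the coefficients are taken directly from the scalar tail above.

static const float kG[4][3] = {
    { 1.0f,  0.0f, 0.0f },
    { 0.5f,  0.5f, 0.5f },
    { 0.5f, -0.5f, 0.5f },
    { 0.0f,  0.0f, 1.0f },
};

inline void weight_transform_2x2_3x3_reference(const float w[3][3], float V[4][4])
{
    float Ww[4][3]; // G * w
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            Ww[i][j] = 0.0f;
            for (int k = 0; k < 3; k++) { Ww[i][j] += kG[i][k] * w[k][j]; }
        }
    }
    for (int i = 0; i < 4; i++) // (G * w) * G^T
    {
        for (int j = 0; j < 4; j++)
        {
            V[i][j] = 0.0f;
            for (int k = 0; k < 3; k++) { V[i][j] += Ww[i][k] * kG[j][k]; }
        }
    }
}
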
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..79f4fa3
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input,
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const auto weight_row_stride = 5 * weight_col_stride;
+ const float *inptrs[5][5];
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+ }
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = vld1q_f32(inptrs[i][j]);
+ inptrs[i][j] += 4;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ // Ww[0][j] = w[0][j]/4.0f;
+ Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
+
+ // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[1][j] = vmulq_n_f32(
+ vaddq_f32(
+ vaddq_f32(
+ vaddq_f32(w[1][j], w[0][j]),
+ vaddq_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ -1.0f/6.0f
+ );
+
+ // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+ Ww[2][j] = vmulq_n_f32(
+ vsubq_f32(
+ vaddq_f32(
+ vsubq_f32(w[1][j], w[0][j]),
+ vsubq_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ 1.0f/6.0f
+ );
+
+ // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[3][j] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+ vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+ vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[5][j] = w[4][j];
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ // V[i][0] = Ww[i][0]/4.0f;
+ V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
+
+ // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][1] = vmulq_n_f32(
+ vaddq_f32(
+ vaddq_f32(
+ vaddq_f32(Ww[i][1], Ww[i][0]),
+ vaddq_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ -1.0f/6.0f
+ );
+
+ // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+ V[i][2] = vmulq_n_f32(
+ vsubq_f32(
+ vaddq_f32(
+ vsubq_f32(Ww[i][1], Ww[i][0]),
+ vsubq_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ 1.0f/6.0f
+ );
+
+ // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][3] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = vmulq_n_f32(
+ vmlaq_n_f32(
+ vaddq_f32(
+ vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][5] = Ww[i][4];
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = vld1_f32(inptrs[i][j]);
+ inptrs[i][j] += 2;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ // Ww[0][j] = w[0][j]/4.0f;
+ Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
+
+ // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[1][j] = vmul_n_f32(
+ vadd_f32(
+ vadd_f32(
+ vadd_f32(w[1][j], w[0][j]),
+ vadd_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ -1.0f/6.0f
+ );
+
+ // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+ Ww[2][j] = vmul_n_f32(
+ vsub_f32(
+ vadd_f32(
+ vsub_f32(w[1][j], w[0][j]),
+ vsub_f32(w[3][j], w[2][j])
+ ),
+ w[4][j]
+ ),
+ 1.0f/6.0f
+ );
+
+ // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[3][j] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+ vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+ vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+ ),
+ w[4][j], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // Ww[5][j] = w[4][j];
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ // V[i][0] = Ww[i][0]/4.0f;
+ V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
+
+ // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][1] = vmul_n_f32(
+ vadd_f32(
+ vadd_f32(
+ vadd_f32(Ww[i][1], Ww[i][0]),
+ vadd_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ -1.0f/6.0f
+ );
+
+ // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+ V[i][2] = vmul_n_f32(
+ vsub_f32(
+ vadd_f32(
+ vsub_f32(Ww[i][1], Ww[i][0]),
+ vsub_f32(Ww[i][3], Ww[i][2])
+ ),
+ Ww[i][4]
+ ),
+ 1.0f/6.0f
+ );
+
+ // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][3] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = vmul_n_f32(
+ vmla_n_f32(
+ vadd_f32(
+ vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+ vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+ ),
+ Ww[i][4], 2.0f
+ ),
+ 1.0f/3.0f
+ );
+
+ // V[i][5] = Ww[i][4];
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[5][5], Ww[6][5], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 5; i++)
+ {
+ for (int j = 0; j < 5; j++)
+ {
+ w[i][j] = *(inptrs[i][j]++);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 5; j++)
+ {
+ Ww[0][j] = w[0][j]/4.0f;
+ Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+ Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+ Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+ Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+ Ww[5][j] = w[4][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = Ww[i][0]/4.0f;
+ V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+ V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+ V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+ V[i][5] = Ww[i][4];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<5, 5, 6, 6, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..fb3d712
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[kernel_cols];
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[kernel_cols], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
+ V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
+ V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
+ V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
+ V[7] = (w[4]*1) / 1;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<1, 5, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<5, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..9e7040b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const auto weight_row_stride = 3 * weight_col_stride;
+ const float *inptrs[3][3];
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+ }
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptrs[i][j]);
+ inptrs[i][j] += 4;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptrs[i][j]);
+ inptrs[i][j] += 2;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptrs[i][j]++);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = 6*w[0][j];
+ Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[5][j] = 24*w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = ( 6*Ww[i][0]) / 576.0;
+ V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][5] = (24*Ww[i][2]) / 576.0;
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<3, 3, 6, 6, float, float, WinogradRoots::Integers>;
+
+} // namespace
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
new file mode 100644
index 0000000..4572348
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "kernel.hpp"
+
+namespace winograd
+{
+
+template <>
+void WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+)
+{
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[3];
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < 3; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
+ V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[7] = (w[2]*1) / 1;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+}
+
+template class WeightTransform<1, 3, 1, 8, float, float, WinogradRoots::Integers>;
+template class WeightTransform<3, 1, 8, 1, float, float, WinogradRoots::Integers>;
+
+} // namespace
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index fea635b..da6e5f6 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,7 +50,13 @@
}
case RoundingPolicy::TO_NEAREST_EVEN:
{
+#ifdef __aarch64__
+ asm("fcvtns %x[res], %s[value]"
+ : [res] "=r"(rounded)
+ : [value] "w"(x));
+#else // __aarch64__
ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
+#endif // __aarch64__
break;
}
default:
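
Editorial sketch: the new AArch64 path uses fcvtns, which converts to the nearest integer with ties going to even. A portable illustration of the same behaviour, assuming the process runs with the default round-to-nearest-even floating-point mode; the non-AArch64 build keeps the error above, this is not the library's fallback.

#include <cmath>

inline int round_to_nearest_even(float x)
{
    // std::nearbyint follows the current FP rounding mode; under the default
    // round-to-nearest-even mode it matches what fcvtns computes in one instruction.
    return static_cast<int>(std::nearbyint(x));
}
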
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 73eaf64..589b737 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -326,24 +326,30 @@
return res;
}
-PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout)
+PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation)
{
const unsigned int width_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const auto &strides = conv_info.stride();
const int out_width = std::ceil(float(input_shape[width_idx]) / float(strides.first));
const int out_height = std::ceil(float(input_shape[height_idx]) / float(strides.second));
- const int pad_width = ((out_width - 1) * strides.first + weights_shape[width_idx] - input_shape[width_idx]);
- const int pad_height = ((out_height - 1) * strides.second + weights_shape[height_idx] - input_shape[height_idx]);
+ const int pad_width = (out_width - 1) * strides.first + (weights_shape[width_idx] + (dilation.x() - 1) * (weights_shape[width_idx] - 1) - input_shape[width_idx]);
+ const int pad_height = (out_height - 1) * strides.second + (weights_shape[height_idx] + (dilation.y() - 1) * (weights_shape[height_idx] - 1) - input_shape[height_idx]);
const int same_pad_left = pad_width / 2;
const int same_pad_top = pad_height / 2;
const int same_pad_right = pad_width - same_pad_left;
const int same_pad_bottom = pad_height - same_pad_top;
- return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
+ return { static_cast<unsigned int>(strides.first),
+ static_cast<unsigned int>(strides.second),
+ static_cast<unsigned int>(same_pad_left),
+ static_cast<unsigned int>(same_pad_right),
+ static_cast<unsigned int>(same_pad_top),
+ static_cast<unsigned int>(same_pad_bottom),
+ DimensionRoundingType::CEIL };
}
-const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
+std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
unsigned int stride_x, unsigned int stride_y)
{
@@ -356,10 +362,10 @@
return std::make_pair<unsigned int, unsigned int>(w, h);
}
-const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
- unsigned int kernel_width, unsigned int kernel_height,
- const PadStrideInfo &pad_stride_info,
- const Size2D &dilation)
+std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &pad_stride_info,
+ const Size2D &dilation)
{
const unsigned int pad_left = pad_stride_info.pad_left();
const unsigned int pad_top = pad_stride_info.pad_top();
@@ -383,18 +389,6 @@
ARM_COMPUTE_ERROR("Unsupported rounding type");
}
- // Make sure that border operations will start from inside the input and not the padded area
- if(((w - 1) * stride_x) >= (width + pad_left))
- {
- --w;
- }
- if(((h - 1) * stride_y) >= (height + pad_top))
- {
- --h;
- }
- ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_left));
- ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_top));
-
return std::make_pair(w, h);
}
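
Editorial sketch of the updated SAME-padding arithmetic: dilation enlarges the kernel to an effective extent of k + (d - 1) * (k - 1), and the total padding is split with the extra element (when odd) going to the right/bottom. One-dimensional illustration only; same_pad_1d is an assumed name.

#include <cmath>
#include <utility>

inline std::pair<int, int> same_pad_1d(int in, int k, int stride, int dilation)
{
    const int out        = static_cast<int>(std::ceil(float(in) / float(stride)));
    const int k_eff      = k + (dilation - 1) * (k - 1);   // dilated kernel extent
    const int pad_total  = (out - 1) * stride + k_eff - in;
    const int pad_before = pad_total / 2;                   // left or top
    return { pad_before, pad_total - pad_before };          // right or bottom
}
// Example: in = 10, k = 3, stride = 1, dilation = 2 -> k_eff = 5, pads = {2, 2}.
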
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
new file mode 100644
index 0000000..7ff2fdf
--- /dev/null
+++ b/src/core/utils/helpers/fft.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/helpers/fft.h"
+
+#include <numeric>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace fft
+{
+std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors)
+{
+ std::vector<unsigned int> stages;
+ unsigned int res = N;
+
+ // Early exit if no supported factors are provided
+ if(supported_factors.empty())
+ {
+ return stages;
+ }
+
+ // Create reverse iterator (Start decomposing from the larger supported factors)
+ auto rfactor_it = supported_factors.rbegin();
+
+ // Decomposition step
+ while(res != 0)
+ {
+ const unsigned int factor = *rfactor_it;
+ if(0 == (res % factor) && res >= factor)
+ {
+ stages.push_back(factor);
+ res /= factor;
+ }
+ else
+ {
+ ++rfactor_it;
+ if(rfactor_it == supported_factors.rend())
+ {
+ if(res > 1)
+ {
+ // Couldn't decompose with given factors
+ stages.clear();
+ return stages;
+ }
+ else
+ {
+ res = 0;
+ }
+ }
+ }
+ }
+
+ return stages;
+}
+
+std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages)
+{
+ std::vector<unsigned int> idx_digit_reverse;
+
+ // Early exit in case N and fft stages do not match
+ const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
+ if(stages_prod != N)
+ {
+ return idx_digit_reverse;
+ }
+
+ // Resize digit reverse vector
+ idx_digit_reverse.resize(N);
+
+ // Get number of radix stages
+ unsigned int n_stages = fft_stages.size();
+
+ // Scan elements
+ for(unsigned int n = 0; n < N; ++n)
+ {
+ unsigned int k = n;
+ unsigned int Nx = fft_stages[0];
+
+ // Scan stages
+ for(unsigned int s = 1; s < n_stages; ++s)
+ {
+ // radix of stage i-th
+ unsigned int Ny = fft_stages[s];
+ unsigned int Ni = Ny * Nx;
+
+ // Update k index
+ k = (k * Ny) % Ni + (k / Nx) % Ny + Ni * (k / Ni);
+
+ // Update Nx
+ Nx *= Ny;
+ }
+
+ // K is the index of digit-reverse
+ idx_digit_reverse[n] = k;
+ }
+
+ return idx_digit_reverse;
+}
+} // namespace fft
+} // namespace helpers
+} // namespace arm_compute
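
Editorial usage sketch for the two helpers above; the expected values are worked out by hand from the code (the decomposition tries the largest supported factors first, and radix-2 stages give the familiar bit-reversal permutation).

#include "arm_compute/core/utils/helpers/fft.h"
#include <cassert>
#include <set>
#include <vector>

void fft_helpers_example()
{
    // 8 = 4 * 2 when the larger factors are tried first.
    const auto stages = arm_compute::helpers::fft::decompose_stages(8, { 2, 3, 4, 5 });
    assert((stages == std::vector<unsigned int>{ 4, 2 }));

    // Digit-reverse indices for N = 4 with two radix-2 stages: the bit-reversal order.
    const auto idx = arm_compute::helpers::fft::digit_reverse_indices(4, { 2, 2 });
    assert((idx == std::vector<unsigned int>{ 0, 2, 1, 3 }));
}
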
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index 08803c7..f6a54a5 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -114,7 +114,10 @@
Coordinates starts, Coordinates ends, Coordinates strides,
int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
{
- Coordinates starts_abs, ends_abs, final_strides;
+ Coordinates starts_abs{};
+ Coordinates ends_abs{};
+ Coordinates final_strides{};
+
for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index 99236d2..055e770 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,12 +42,12 @@
return _instance;
}
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
{
std::lock_guard<arm_compute::Mutex> lock(_mtx);
if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
{
- _loggers[name] = std::make_shared<Logger>(name, log_level, std::move(printers));
+ _loggers[name] = std::make_shared<Logger>(name, log_level, printers);
}
}
@@ -66,7 +66,7 @@
return (_loggers.find(name) != _loggers.end()) ? _loggers[name] : nullptr;
}
-void LoggerRegistry::create_reserved_loggers(LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
{
std::lock_guard<arm_compute::Mutex> lock(_mtx);
for(const auto &r : _reserved_loggers)
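
The printer lists are now taken by const reference, so a caller can share one list across several loggers without a copy per call. A rough sketch, assuming the StdPrinter and LogLevel::INFO symbols from the public logging headers:

    #include "arm_compute/core/utils/logging/LoggerRegistry.h"
    #include "arm_compute/core/utils/logging/Printers.h"

    #include <memory>
    #include <vector>

    int main()
    {
        using namespace arm_compute::logging;

        // One printer list reused for every logger created below
        const std::vector<std::shared_ptr<Printer>> printers{ std::make_shared<StdPrinter>() };

        LoggerRegistry::get().create_reserved_loggers(LogLevel::INFO, printers);
        LoggerRegistry::get().create_logger("EXAMPLES", LogLevel::INFO, printers);

        if(auto logger = LoggerRegistry::get().logger("EXAMPLES"))
        {
            logger->log(LogLevel::INFO, "logger registry example");
        }
        return 0;
    }
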
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index ea9ba77..d606adb 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,12 +29,12 @@
using namespace arm_compute::quantization;
-constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+constexpr int64_t fixed_point_one_Q0 = (1LL << 31);
constexpr float epsilon = 0.00001f;
arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(float multiplier,
- int *quant_multiplier,
- int *right_shift)
+ int *quant_multiplier,
+ int *right_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
@@ -71,8 +71,8 @@
}
arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(float multiplier,
- int *quantized_multiplier,
- int *left_shift)
+ int *quantized_multiplier,
+ int *left_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
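
Both helpers express a real-valued multiplier as a Q0.31 fixed-point value plus a shift. The sketch below shows how a caller would typically check the result; the reconstruction formula is the standard one for this representation and is only illustrative here.

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    #include <cstdio>

    int main()
    {
        // A multiplier < 1, e.g. (input_scale * weights_scale) / output_scale
        const float multiplier = 0.3456f;

        int quant_multiplier = 0;
        int right_shift      = 0;
        const auto status = arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &right_shift);

        if(bool(status))
        {
            // Approximate reconstruction: multiplier ~= quant_multiplier / 2^31 / 2^right_shift
            const double reconstructed = double(quant_multiplier) / double(1LL << 31) / double(1 << right_shift);
            std::printf("%.6f ~= %.6f (multiplier=%d, shift=%d)\n", multiplier, reconstructed, quant_multiplier, right_shift);
        }
        return 0;
    }
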
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 88e2682..9d437b1 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -152,7 +152,7 @@
return true;
}
-TensorID Graph::create_tensor(TensorDescriptor desc)
+TensorID Graph::create_tensor(const TensorDescriptor &desc)
{
TensorID tid = _tensors.size();
auto tensor = support::cpp14::make_unique<Tensor>(tid, desc);
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index a944d2c..5db9540 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -30,15 +30,19 @@
#include "support/ToolchainSupport.h"
-#define CHECK_NODEIDX_PAIR(pair, g) \
- ARM_COMPUTE_ERROR_ON(((pair).node_id >= (g).nodes().size()) || ((g).node((pair).node_id) == nullptr) || ((pair).index >= (g).node((pair).node_id)->num_outputs()));
-
namespace arm_compute
{
namespace graph
{
namespace
{
+inline void check_nodeidx_pair(const NodeIdxPair &pair, const Graph &g)
+{
+ ARM_COMPUTE_UNUSED(pair);
+ ARM_COMPUTE_UNUSED(g);
+ ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs()));
+}
+
 Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
{
INode *node = g.node(nid);
@@ -62,10 +66,10 @@
return Status{};
}
-NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
params.name = params.name.empty() ? "" : params.name + name;
- auto nid = GraphBuilder::add_const_node(g, params, std::move(desc), std::move(accessor));
+ auto nid = GraphBuilder::add_const_node(g, params, desc, std::move(accessor));
set_node_params(g, nid, params);
return nid;
}
@@ -73,7 +77,7 @@
template <typename NT, typename... Args>
 NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
g.add_connection(input.node_id, input.index, nid, 0);
@@ -81,9 +85,27 @@
return nid;
}
+
+template <typename NT, typename... Args>
+NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &params, const std::vector<NodeIdxPair> &inputs, Args &&... args)
+{
+ ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
+
+ NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
+
+ unsigned int i = 0;
+ for(const auto &input : inputs)
+ {
+ check_nodeidx_pair(input, g);
+ g.add_connection(input.node_id, input.index, nid, i++);
+ }
+ set_node_params(g, nid, params);
+
+ return nid;
+}
} // namespace
-NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
auto nid = g.add_node<ConstNode>(desc);
set_node_params(g, nid, params);
@@ -91,7 +113,7 @@
return nid;
}
-NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
auto nid = g.add_node<InputNode>(desc);
set_node_params(g, nid, params);
@@ -101,7 +123,7 @@
NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
NodeID nid = g.add_node<OutputNode>();
g.add_connection(input.node_id, input.index, nid, 0);
@@ -111,16 +133,17 @@
return nid;
}
-NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info)
+NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+ const QuantizationInfo out_quant_info)
{
- return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info);
+ return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info, out_quant_info);
}
NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
bool has_beta = (beta_accessor != nullptr);
bool has_gamma = (gamma_accessor != nullptr);
@@ -170,8 +193,8 @@
NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
{
- CHECK_NODEIDX_PAIR(input, g);
- CHECK_NODEIDX_PAIR(deltas, g);
+ check_nodeidx_pair(input, g);
+ check_nodeidx_pair(deltas, g);
NodeID nid = g.add_node<BoundingBoxTransformLayerNode>(info);
@@ -194,7 +217,7 @@
const QuantizationInfo weights_quant_info,
const QuantizationInfo out_quant_info)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON(depth == 0);
ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
@@ -202,14 +225,15 @@
// Get input tensor descriptor
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+ const DataLayout input_data_layout = input_tensor_desc.layout;
// Create weights node
TensorDescriptor w_desc = input_tensor_desc;
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
if(!weights_quant_info.empty())
{
w_desc.quant_info = weights_quant_info;
@@ -248,7 +272,7 @@
Size2D inner_border, ITensorAccessorUPtr weights_accessor,
ITensorAccessorUPtr bias_accessor)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON(depth == 0);
ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
@@ -256,14 +280,15 @@
// Get input tensor descriptor
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+ const DataLayout input_data_layout = input_tensor_desc.layout;
// Create weights node
TensorDescriptor w_desc = input_tensor_desc;
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
@@ -293,40 +318,29 @@
return deconv_nid;
}
-NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, std::vector<NodeIdxPair> inputs, DataLayoutDimension axis)
+NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, descriptors::ConcatLayerDescriptor concat_descriptor)
{
- ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
-
- NodeID nid = g.add_node<ConcatenateLayerNode>(inputs.size(), axis);
-
- unsigned int i = 0;
- for(const auto &input : inputs)
- {
- CHECK_NODEIDX_PAIR(input, g);
- g.add_connection(input.node_id, input.index, nid, i++);
- }
- set_node_params(g, nid, params);
-
- return nid;
+ return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(), concat_descriptor);
}
NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
- ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
+ ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info, const QuantizationInfo out_quant_info)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
bool has_bias = (bias_accessor != nullptr);
// Get input tensor descriptor
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+ const DataLayout input_data_layout = input_tensor_desc.layout;
// Create weights node
TensorDescriptor w_desc = input_tensor_desc;
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
- w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+ w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
if(!quant_info.empty())
{
@@ -351,7 +365,7 @@
}
// Create convolution node and connect
- NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method);
+ NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method, out_quant_info);
g.add_connection(input.node_id, input.index, conv_nid, 0);
g.add_connection(w_nid, 0, conv_nid, 1);
if(has_bias)
@@ -362,11 +376,11 @@
return conv_nid;
}
-NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info)
+NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info)
{
- CHECK_NODEIDX_PAIR(input_loc, g);
- CHECK_NODEIDX_PAIR(input_conf, g);
- CHECK_NODEIDX_PAIR(input_priorbox, g);
+ check_nodeidx_pair(input_loc, g);
+ check_nodeidx_pair(input_conf, g);
+ check_nodeidx_pair(input_priorbox, g);
// Create detection_output node and connect
NodeID detect_nid = g.add_node<DetectionOutputLayerNode>(detect_info);
@@ -386,8 +400,8 @@
NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
{
- CHECK_NODEIDX_PAIR(input0, g);
- CHECK_NODEIDX_PAIR(input1, g);
+ check_nodeidx_pair(input0, g);
+ check_nodeidx_pair(input1, g);
NodeID nid = g.add_node<EltwiseLayerNode>(operation);
@@ -405,11 +419,38 @@
}
NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
+ NodeID weights_nid, NodeID bias_nid,
+ const FullyConnectedLayerInfo fc_info, const QuantizationInfo out_quant_info)
+{
+ check_nodeidx_pair(input, g);
+ ARM_COMPUTE_ERROR_ON(num_outputs == 0);
+ ARM_COMPUTE_ERROR_ON(weights_nid == EmptyNodeID);
+
+ const bool has_bias = (bias_nid != EmptyNodeID);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Create fully connected node and connect
+ NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info);
+ g.add_connection(input.node_id, input.index, fc_nid, 0);
+ g.add_connection(weights_nid, 0, fc_nid, 1);
+ if(has_bias)
+ {
+ g.add_connection(bias_nid, 0, fc_nid, 2);
+ }
+
+ set_node_params(g, fc_nid, params);
+
+ return fc_nid;
+}
+
+NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
const FullyConnectedLayerInfo fc_info,
const QuantizationInfo weights_quant_info, const QuantizationInfo out_quant_info)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON(num_outputs == 0);
bool has_bias = (bias_accessor != nullptr);
@@ -450,9 +491,9 @@
NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
{
- CHECK_NODEIDX_PAIR(scores, g);
- CHECK_NODEIDX_PAIR(deltas, g);
- CHECK_NODEIDX_PAIR(anchors, g);
+ check_nodeidx_pair(scores, g);
+ check_nodeidx_pair(deltas, g);
+ check_nodeidx_pair(anchors, g);
NodeID nid = g.add_node<GenerateProposalsLayerNode>(info);
@@ -472,7 +513,7 @@
NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
// Get input tensor descriptor
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
@@ -510,10 +551,10 @@
return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
}
-NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, PriorBoxLayerInfo prior_info)
+NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
{
- CHECK_NODEIDX_PAIR(input0, g);
- CHECK_NODEIDX_PAIR(input1, g);
+ check_nodeidx_pair(input0, g);
+ check_nodeidx_pair(input1, g);
// Create priorbox node and connect
NodeID prior_nid = g.add_node<PriorBoxLayerNode>(prior_info);
@@ -543,8 +584,8 @@
NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
{
- CHECK_NODEIDX_PAIR(input, g);
- CHECK_NODEIDX_PAIR(rois, g);
+ check_nodeidx_pair(input, g);
+ check_nodeidx_pair(rois, g);
NodeID nid = g.add_node<ROIAlignLayerNode>(pool_info);
@@ -557,17 +598,18 @@
 NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
{
- CHECK_NODEIDX_PAIR(input, g);
+ check_nodeidx_pair(input, g);
// Get input tensor descriptor
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+ const DataLayout input_data_layout = input_tensor_desc.layout;
// Create mul node
TensorDescriptor mul_desc = input_tensor_desc;
- const size_t C = input_tensor_desc.shape[get_dimension_idx(mul_desc, DataLayoutDimension::CHANNEL)];
- mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), 1);
- mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), 1);
- mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL), C);
+ const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
+ mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1);
+ mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1);
+ mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C);
NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };
@@ -599,6 +641,11 @@
return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
}
+NodeID GraphBuilder::add_stack_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, int axis)
+{
+ return create_simple_multiple_input_single_output_node<StackLayerNode>(g, params, inputs, inputs.size(), axis);
+}
+
NodeID GraphBuilder::add_upsample_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D info, InterpolationPolicy upsampling_policy)
{
return create_simple_single_input_output_node<UpsampleLayerNode>(g, params, input, info, upsampling_policy);
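
The GraphBuilder changes replace the CHECK_NODEIDX_PAIR macro with a helper function and route multi-input nodes (concatenate, stack) through create_simple_multiple_input_single_output_node. A hedged illustration of the new add_stack_node entry point; shapes, names and the stacking axis are made up:

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/GraphBuilder.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    // Sketch only: stack two single-channel inputs along a hypothetical axis 2
    void build_stack_example(Graph &g)
    {
        NodeParams common_params{ "stack_example", Target::UNSPECIFIED };

        const TensorDescriptor desc(TensorShape(16U, 16U, 1U), DataType::F32);

        NodeID in0 = GraphBuilder::add_input_node(g, common_params, desc, nullptr);
        NodeID in1 = GraphBuilder::add_input_node(g, common_params, desc, nullptr);

        // The vector of NodeIdxPair is now taken by const reference and each pair
        // is validated through check_nodeidx_pair() before being connected
        NodeID stack = GraphBuilder::add_stack_node(g, common_params, { { in0, 0 }, { in1, 0 } }, 2);

        GraphBuilder::add_output_node(g, common_params, { stack, 0 }, nullptr);
    }
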
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 57c5f9d..4f942b9 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,9 +45,6 @@
void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
{
- // Setup graph context if not done manually
- setup_default_graph_context(ctx);
-
// Check if graph has been registered
if(_workloads.find(graph.id()) != std::end(_workloads))
{
@@ -55,7 +52,7 @@
}
// Force target to all graph construct
- // TODO (geopin01) : Support heterogeneous execution
+ // TODO (COMPMID-2014) : Support heterogeneous execution
Target forced_target = target;
if(!is_target_supported(target))
{
@@ -64,6 +61,10 @@
}
force_target_to_graph(graph, forced_target);
+ // Setup backend context
+ // TODO (COMPMID-2014) : Setup all backends needed by the graph
+ setup_requested_backend_context(ctx, forced_target);
+
// Configure all tensors
detail::configure_all_tensors(graph);
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 9850128..205ef11 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,7 +108,7 @@
_bound_edges.erase(eid);
}
-const std::set<EdgeID> Tensor::bound_edges() const
+std::set<EdgeID> Tensor::bound_edges() const
{
return _bound_edges;
}
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index e0ba7e2..b63672b 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -17,7 +17,7 @@
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWNISE, ARISING FROM,
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
@@ -100,5 +100,55 @@
}
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
+
+ConvolutionMethod Convolution_method_from_name(const std::string &name)
+{
+ static const std::map<std::string, ConvolutionMethod> methods =
+ {
+ { "default", ConvolutionMethod::Default },
+ { "direct", ConvolutionMethod::Direct },
+ { "gemm", ConvolutionMethod::GEMM },
+ { "winograd", ConvolutionMethod::Winograd },
+ };
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ try
+ {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+ return methods.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ }
+ catch(const std::out_of_range &)
+ {
+ throw std::invalid_argument(name);
+ }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+}
+
+DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name)
+{
+ static const std::map<std::string, DepthwiseConvolutionMethod> methods =
+ {
+ { "default", DepthwiseConvolutionMethod::Default },
+ { "gemv", DepthwiseConvolutionMethod::GEMV },
+ { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 },
+ };
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ try
+ {
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+ return methods.at(arm_compute::utility::tolower(name));
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
+ }
+ catch(const std::out_of_range &)
+ {
+ throw std::invalid_argument(name);
+ }
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+}
+
} // namespace graph
} // namespace arm_compute
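
The two new loaders follow the existing pattern: lower-case the name, look it up in a static map, and translate std::out_of_range into std::invalid_argument when exceptions are enabled. A small usage sketch, assuming the declarations sit next to the other loaders in TypeLoader.h:

    #include "arm_compute/graph/TypeLoader.h"

    #include <iostream>
    #include <string>

    int main(int argc, char **argv)
    {
        using namespace arm_compute::graph;

        // Hypothetical command-line value; "default", "direct", "gemm" and "winograd" are accepted
        const std::string requested = (argc > 1) ? argv[1] : "winograd";

        try
        {
            const ConvolutionMethod method = Convolution_method_from_name(requested);
            static_cast<void>(method);
            std::cout << "Selected convolution method: " << requested << "\n";
        }
        catch(const std::invalid_argument &e)
        {
            std::cerr << "Unknown convolution method: " << e.what() << "\n";
            return 1;
        }
        return 0;
    }
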
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index 71ec548..4c34dd8 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,13 +104,14 @@
}
}
-void setup_default_graph_context(GraphContext &ctx)
+void setup_requested_backend_context(GraphContext &ctx, Target target)
{
- for(const auto &backend : backends::BackendRegistry::get().backends())
+ if(backends::BackendRegistry::get().contains(target))
{
- if(backend.second->is_backend_supported())
+ const auto &backend = backends::BackendRegistry::get().find_backend(target);
+ if(backend->is_backend_supported())
{
- backend.second->setup_backend_context(ctx);
+ backend->setup_backend_context(ctx);
}
}
}
@@ -118,12 +119,12 @@
size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
{
ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
- return descriptor.shape[get_dimension_idx(descriptor, data_layout_dimension)];
+ return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)];
}
-size_t get_dimension_idx(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
{
- ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
/* Return the index based on the data layout
* [N C H W]
@@ -133,13 +134,13 @@
switch(data_layout_dimension)
{
case DataLayoutDimension::CHANNEL:
- return (descriptor.layout == DataLayout::NCHW) ? 2 : 0;
+ return (data_layout == DataLayout::NCHW) ? 2 : 0;
break;
case DataLayoutDimension::HEIGHT:
- return (descriptor.layout == DataLayout::NCHW) ? 1 : 2;
+ return (data_layout == DataLayout::NCHW) ? 1 : 2;
break;
case DataLayoutDimension::WIDTH:
- return (descriptor.layout == DataLayout::NCHW) ? 0 : 1;
+ return (data_layout == DataLayout::NCHW) ? 0 : 1;
break;
case DataLayoutDimension::BATCHES:
return 3;
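
get_dimension_idx() now takes the DataLayout directly instead of a whole TensorDescriptor, which is what lets the call sites above hoist input_tensor_desc.layout into a local. A small sketch of the mapping it implements:

    #include "arm_compute/graph/Utils.h"

    #include <cassert>

    int main()
    {
        using namespace arm_compute;
        using namespace arm_compute::graph;

        // NCHW shapes are stored as [W, H, C, N]; NHWC shapes as [C, W, H, N]
        assert(get_dimension_idx(DataLayout::NCHW, DataLayoutDimension::WIDTH) == 0);
        assert(get_dimension_idx(DataLayout::NCHW, DataLayoutDimension::CHANNEL) == 2);
        assert(get_dimension_idx(DataLayout::NHWC, DataLayoutDimension::CHANNEL) == 0);
        assert(get_dimension_idx(DataLayout::NHWC, DataLayoutDimension::WIDTH) == 1);
        assert(get_dimension_idx(DataLayout::NHWC, DataLayoutDimension::BATCHES) == 3);
        return 0;
    }
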
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index ae7f0a5..0666ec0 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,6 +81,11 @@
_tuner.set_tune_new_kernels(enable_tuning);
}
+void CLDeviceBackend::set_kernel_tuning_mode(CLTunerMode tuning_mode)
+{
+ _tuner.set_tuner_mode(tuning_mode);
+}
+
void CLDeviceBackend::initialize_backend()
{
// Setup Scheduler
@@ -118,6 +123,7 @@
}
set_kernel_tuning(ctx.config().use_tuner);
+ set_kernel_tuning_mode(ctx.config().tuner_mode);
// Setup a management backend
if(ctx.memory_management_ctx(Target::CL) == nullptr)
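
The tuner mode is now propagated from the graph configuration alongside the on/off flag. With the frontend API this would typically be driven as below; both fields appear in GraphConfig, while the enum values are the usual CLTunerMode choices and should be treated as illustrative:

    #include "arm_compute/graph.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    // Sketch: enable the CL tuner and select a tuning effort before finalizing a stream
    void finalize_with_tuner(frontend::Stream &stream)
    {
        GraphConfig config{};
        config.use_tuner  = true;
        config.tuner_mode = CLTunerMode::NORMAL; // EXHAUSTIVE / NORMAL / RAPID

        // CLDeviceBackend::setup_backend_context() forwards both fields to the tuner
        stream.finalize(Target::CL, config);
    }
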
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index b9e3ddc..90c1613 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -40,7 +40,8 @@
/** Target specific information structure used to pass information to the layer templates */
struct CLTargetInfo
{
- using TensorType = arm_compute::ICLTensor;
+ using TensorType = arm_compute::ICLTensor;
+ using TensorConcreteType = CLTensor;
static Target TargetType;
};
@@ -69,6 +70,14 @@
using Subtraction = CLArithmeticSubtraction;
using Multiplication = CLPixelWiseMultiplication;
};
+
+/** Function and tensor types to be used inside a CL fused convolution/batch normalization layer */
+struct CLFusedLayerTypes
+{
+ using ConvolutionLayer = CLConvolutionLayer;
+ using FuseBatchNormalization = CLFuseBatchNormalization;
+};
+
// TODO (isagot01): Remove once we support heterogeneous scheduling at function level
/** Wrapper for the CPP Function in the OpenCL backend **/
class CPPWrapperFunction : public IFunction
@@ -192,6 +201,8 @@
return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
case NodeType::FullyConnectedLayer:
return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::FusedConvolutionBatchNormalizationLayer:
+ return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
case NodeType::GenerateProposalsLayer:
return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
case NodeType::NormalizationLayer:
@@ -218,6 +229,8 @@
return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::SoftmaxLayer:
return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ case NodeType::StackLayer:
+ return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
case NodeType::UpsampleLayer:
return detail::create_upsample_layer<CLUpsampleLayer, CLTargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
case NodeType::YOLOLayer:
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index 4b71837..cb8dc0a 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -74,6 +74,8 @@
return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::ReorgLayer:
return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ case NodeType::ReshapeLayer:
+ return detail::validate_reshape_layer<CLReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ROIAlignLayer:
return detail::validate_roi_align_layer<CLROIAlignLayer>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
case NodeType::SliceLayer:
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index dc987dd..690a311 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,8 @@
/** Target specific information structure used to pass information to the layer templates */
struct NETargetInfo
{
- using TensorType = arm_compute::ITensor;
+ using TensorType = arm_compute::ITensor;
+ using TensorConcreteType = arm_compute::Tensor;
static Target TargetType;
};
@@ -76,6 +77,13 @@
using Multiplication = NEPixelWiseMultiplication;
};
+/** Function and tensor types to be used inside a NEON fused convolution/batch normalization layer */
+struct NEFusedLayerTypes
+{
+ using ConvolutionLayer = NEConvolutionLayer;
+ using FuseBatchNormalization = NEFuseBatchNormalization;
+};
+
namespace detail
{
// Specialized functions
@@ -135,8 +143,10 @@
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
- << " Target " << NETargetInfo::TargetType
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << func_name
+ << " Target: " << NETargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
<< qss.str()
<< " Input shape: " << input->info()->tensor_shape()
@@ -210,6 +220,8 @@
return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
case NodeType::FullyConnectedLayer:
return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::FusedConvolutionBatchNormalizationLayer:
+ return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
case NodeType::NormalizationLayer:
return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
case NodeType::PermuteLayer:
@@ -226,6 +238,8 @@
return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
case NodeType::SoftmaxLayer:
return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ case NodeType::StackLayer:
+ return detail::create_stack_layer<NEStackLayer, NETargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
case NodeType::UpsampleLayer:
return detail::create_upsample_layer<NEUpsampleLayer, NETargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
case NodeType::YOLOLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index b0feec5..77f2e7f 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -74,6 +74,8 @@
return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::ReorgLayer:
return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ case NodeType::ReshapeLayer:
+ return detail::validate_reshape_layer<NEReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ROIAlignLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
case NodeType::SliceLayer:
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index 7fc5ca0..5e31309 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -136,7 +136,7 @@
// Then add it to the list of transition buffers
ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
- transition_handles.input_handles.push_back(std::make_pair(tensor_handle, mm_group));
+ transition_handles.input_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
}
}
@@ -149,7 +149,7 @@
{
ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
- transition_handles.output_handles.push_back(std::make_pair(tensor_handle, mm_group));
+ transition_handles.output_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
}
}
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index 767154b..900be42 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -204,10 +204,13 @@
bool call_all_input_node_accessors(ExecutionWorkload &workload)
{
- return !std::any_of(std::begin(workload.inputs), std::end(workload.inputs), [](Tensor * input_tensor)
+ bool is_valid = true;
+ std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor)
{
- return (input_tensor == nullptr) || !input_tensor->call_accessor();
+ bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
+ is_valid = is_valid && valid_input;
});
+ return is_valid;
}
void prepare_all_tasks(ExecutionWorkload &workload)
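
The rewrite of call_all_input_node_accessors() changes the semantics slightly: every input accessor is now invoked even after one fails, whereas std::any_of stopped at the first failing tensor. A stand-alone illustration of that difference, unrelated to the library types:

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main()
    {
        int calls_any_of   = 0;
        int calls_for_each = 0;
        const std::vector<bool> results{ true, false, true };

        // Short-circuits: stops visiting once a failure is found
        const bool ok_short = !std::any_of(results.begin(), results.end(), [&](bool r)
        {
            ++calls_any_of;
            return !r;
        });

        // Visits every element, then reports the aggregate result
        bool ok_full = true;
        std::for_each(results.begin(), results.end(), [&](bool r)
        {
            ++calls_for_each;
            ok_full = ok_full && r;
        });

        std::cout << ok_short << " after " << calls_any_of << " calls, "
                  << ok_full << " after " << calls_for_each << " calls\n";
        return 0;
    }
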
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index a170c4d..7994541 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,18 +62,19 @@
// Get output tensor
auto output_tensor = node->output(0);
- // Check concatenation axis (Sub-tensor optimization is support for concatenation axis >=2)
+ // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2)
auto *concat_node = arm_compute::utils::cast::polymorphic_downcast<ConcatenateLayerNode *>(node);
- if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc(), concat_node->concatenation_axis()) < 2)
+ if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
{
continue;
}
- // Check that all tensor have the same target and valid inputs
+ // Check that all tensors have the same target, valid inputs and the same quantization info
bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
[&](const EdgeID & eid)
{
- return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target);
+ return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target)
+ && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
});
// Create subtensors
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index d69d2cd..3d53f49 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,12 +47,12 @@
// Split input
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
- const unsigned int input_idx = get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL);
+ const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
// Split weights
const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]);
- const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc, DataLayoutDimension::BATCHES);
+ const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx);
// Split bias
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index 31921b3..1c2985d 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,8 +56,8 @@
ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr);
- // Prevent in-place operation if there is an accessor bound to the in-place tensor
- if(new_output_tensor->accessor() == nullptr)
+ // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different
+ if(new_output_tensor->accessor() == nullptr || current_output_tensor->desc().quant_info == new_output_tensor->desc().quant_info)
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Switching to in-place computation for the node with ID : "
<< node->id() << " and name : " << node->name() << std::endl);
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 9dc02d1..427d7b5 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,9 +23,11 @@
*/
#include "arm_compute/graph/mutators/NodeFusionMutator.h"
-#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphBuilder.h"
#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
#include "arm_compute/graph/nodes/Nodes.h"
#include "arm_compute/core/utils/misc/Cast.h"
@@ -38,69 +40,156 @@
{
namespace detail
{
+void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge)
+{
+ ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
+
+ auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
+ auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+
+ // Not fusing if number of groups is greater than 1
+ if(conv_node->num_groups() > 1)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id()
+ << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+ // Prevent fusion if fused node has an output accessor
+ if(conv_node->output(0)->accessor() == nullptr)
+ {
+ const Target assigned_target = conv_node->assigned_target();
+
+ // Extract conv inputs
+ const auto conv_input_id = conv_node->input_edge(0)->producer_id();
+ const auto conv_weights_id = conv_node->input_edge(1)->producer_id();
+ const auto out_quant_info = conv_node->output(0)->desc().quant_info;
+ const auto conv_info = conv_node->convolution_info();
+ const auto conv_method = conv_node->convolution_method();
+ const auto num_groups = conv_node->num_groups();
+ const auto act_info = bn_node->fused_activation();
+ FastMathHint fast_math_hint = conv_node->fast_math_hint();
+
+ // Extract bn inputs
+ const auto bn_mean_id = bn_node->input_edge(1)->producer_id();
+ const auto bn_var_id = bn_node->input_edge(2)->producer_id();
+ const auto bn_beta_id = bn_node->input_edge(3)->producer_id();
+ const auto bn_gamma_id = bn_node->input_edge(4)->producer_id();
+ const auto epsilon = bn_node->epsilon();
+
+ // Create the fused node
+ const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint, out_quant_info, act_info);
+
+ if(conv_node->input_edge(2) != nullptr)
+ {
+ auto conv_bias_id = conv_node->input_edge(2)->producer_id();
+ g.add_connection(conv_bias_id, 0, fused_id, 2);
+ }
+
+ // Add connections from the conv/batch_norm inputs to the fused node
+ g.add_connection(conv_input_id, 0, fused_id, 0);
+ g.add_connection(conv_weights_id, 0, fused_id, 1);
+ g.add_connection(bn_mean_id, 0, fused_id, 3);
+ g.add_connection(bn_var_id, 0, fused_id, 4);
+ g.add_connection(bn_beta_id, 0, fused_id, 5);
+ g.add_connection(bn_gamma_id, 0, fused_id, 6);
+
+ auto fused_node = g.node(fused_id);
+ std::vector<NodeIdxPair> bn_driving_nodes = get_driving_nodes(*bn_node);
+
+ // Extract batch normalization node accessor if any
+ auto bn_node_accessor = bn_node->output(0)->extract_accessor();
+ auto bn_node_name = bn_node->name();
+
+ // Remove batch normalization node
+ g.remove_node(bn_node->id());
+
+ // Get driving nodes of batch normalization node
+ for(auto &driving_node : bn_driving_nodes)
+ {
+ g.add_connection(fused_id, 0, driving_node.node_id, driving_node.index);
+ configure_tensor(fused_node->output(0));
+ }
+ // Update fused node outputs
+ fused_node->output(0)->set_accessor(std::move(bn_node_accessor));
+ fused_node->set_assigned_target(assigned_target);
+ fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target });
+
+ // Remove convolution node
+ g.remove_node(conv_node->id());
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
+ }
+}
+
template <typename N>
-void fuse_node_with_activation(Graph &g,
- const std::set<Activation> &supported_fused_activations,
- std::function<bool(INode &)> const &prec)
+void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set<Activation> &supported_fused_activations)
+{
+ ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
+
+ auto *n_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
+ auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
+
+ ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
+
+ // Check if activation is supported for fusion
+ if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
+ << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+ // Prevent fusion if fused node has an output accessor
+ if(n_node->output(0)->accessor() == nullptr)
+ {
+ // Get driving nodes of activation node
+ std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
+
+ // Set activation info to fused node
+ n_node->set_fused_activation(act_node->activation_info());
+
+ // Extract activation node accessor if any
+ auto act_node_accessor = act_node->output(0)->extract_accessor();
+
+ // Remove activation node
+ g.remove_node(act_node->id());
+
+ // Update fused node outputs
+ for(auto &driving_node : act_driving_nodes)
+ {
+ g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
+ }
+
+ // Update accessor to fused node
+ n_node->output(0)->set_accessor(std::move(act_node_accessor));
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
+ }
+}
+
+template <typename N1, typename N2, typename F, typename... Args>
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
{
// Not interested in the order of nodes
for(auto &node : g.nodes())
{
// Check if the node is of type N and not a branching node
- if(node && node->type() == N::node_type && node->output_edges().size() == 1)
+ if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
{
- auto output_edge_id = *node->output_edges().begin();
- auto output_edge = g.edge(output_edge_id);
+ const auto output_edge_id = *node->output_edges().begin();
+ const auto output_edge = g.edge(output_edge_id);
+
// Check if following node is an activation layer node
- if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
+ if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
{
- auto *n_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
- auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
-
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
-
- // Check given precondition
- if(!prec(*n_node))
- {
- continue;
- }
- // Check if activation is supported for fusion
- if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
- {
- continue;
- }
-
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
- << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
-
- // Prevent fusion if fused node has an output accessor
- if(n_node->output(0)->accessor() == nullptr)
- {
- // Get driving nodes of activation node
- std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
-
- // Set activation info to fused node
- n_node->set_fused_activation(act_node->activation_info());
-
- // Extract activation node accessor if any
- auto act_node_accessor = act_node->output(0)->extract_accessor();
-
- // Remove activation node
- g.remove_node(act_node->id());
-
- // Update fused node outputs
- for(auto &driving_node : act_driving_nodes)
- {
- g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
- }
-
- // Update accessor to fused node
- n_node->output(0)->set_accessor(std::move(act_node_accessor));
- }
- else
- {
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
- }
+ fuse_fcn(g, output_edge, optional_arguments...);
}
}
}
@@ -118,20 +207,30 @@
const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
// Preconditions
- auto empty_prec = [](INode & n)
+ auto empty_prec = [](INode &)
{
return true;
};
- auto qs8_prec = [](INode & n)
+ auto qs8_prec = [&g](INode & n)
{
ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
- return n.output(0)->desc().data_type == DataType::QASYMM8;
+
+ const auto output_edge_id = *n.output_edges().begin();
+ const auto output_edge = g.edge(output_edge_id);
+ // To perform fusion the two nodes must have the same output quantization information
+ const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
+ const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8;
+
+ return (output_qasymm8 && same_qinfo) || !output_qasymm8;
};
// Fusion mutations
- detail::fuse_node_with_activation<BatchNormalizationLayerNode>(g, supported_fused_activations, empty_prec);
- detail::fuse_node_with_activation<ConvolutionLayerNode>(g, supported_fused_activations, empty_prec);
- detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>(g, supported_fused_activations, qs8_prec);
+ detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+ detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
+ detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
+
+ // TODO (COMPMID-2055): re-enable once we fuse bias and activations to convolution
+ // detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
}
} // namespace graph
} // namespace arm_compute
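
For context, the (currently disabled) convolution/batch-normalization fusion relies on the usual folding of the batch-normalization parameters into the convolution weights and bias; the fused node only rewires the tensors, and the backend function performs the folding. A sketch of the per-channel arithmetic in plain C++, not the library types:

    #include <cmath>
    #include <vector>

    // Illustrative only: fold batch normalization (mean, var, beta, gamma, epsilon)
    // into per-output-channel convolution weights and bias.
    void fold_batch_norm(std::vector<float> &weights, // [out_channels * weights_per_channel]
                         std::vector<float> &bias,    // [out_channels]
                         const std::vector<float> &mean,
                         const std::vector<float> &var,
                         const std::vector<float> &beta,
                         const std::vector<float> &gamma,
                         float epsilon)
    {
        const size_t out_channels        = bias.size();
        const size_t weights_per_channel = weights.size() / out_channels;

        for(size_t c = 0; c < out_channels; ++c)
        {
            const float scale = gamma[c] / std::sqrt(var[c] + epsilon);

            for(size_t i = 0; i < weights_per_channel; ++i)
            {
                weights[c * weights_per_channel + i] *= scale; // W' = gamma * W / sqrt(var + eps)
            }
            bias[c] = scale * (bias[c] - mean[c]) + beta[c];   // b' = gamma * (b - mean) / sqrt(var + eps) + beta
        }
    }
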
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
index 414684c..ada6cf9 100644
--- a/src/graph/nodes/ActivationLayerNode.cpp
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,8 @@
{
namespace graph
{
-ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info)
- : _info(info)
+ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info, QuantizationInfo out_quant_info)
+ : _info(info), _out_quant_info(out_quant_info)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -62,12 +62,18 @@
const Tensor *src = input(0);
ARM_COMPUTE_ERROR_ON(src == nullptr);
- return src->desc();
+ TensorDescriptor output_info = src->desc();
+ if(!_out_quant_info.empty())
+ {
+ output_info.quant_info = _out_quant_info;
+ }
+
+ return output_info;
}
NodeType ActivationLayerNode::type() const
{
- return NodeType::ActivationLayer;
+ return ActivationLayerNode::node_type;
}
void ActivationLayerNode::accept(INodeVisitor &v)
@@ -75,4 +81,4 @@
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp
index ade3f6e..5f13b90 100644
--- a/src/graph/nodes/ConcatenateLayerNode.cpp
+++ b/src/graph/nodes/ConcatenateLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,8 @@
{
namespace graph
{
-ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, DataLayoutDimension axis)
- : _total_nodes(total_nodes), _axis(axis), _is_enabled(true)
+ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor)
+ : _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true)
{
_input_edges.resize(_total_nodes, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -53,7 +53,12 @@
DataLayoutDimension ConcatenateLayerNode::concatenation_axis() const
{
- return _axis;
+ return _concat_descriptor.axis;
+}
+
+QuantizationInfo ConcatenateLayerNode::output_quantization_info() const
+{
+ return _concat_descriptor.output_qinfo;
}
TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
@@ -62,28 +67,18 @@
ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
TensorDescriptor output_descriptor = input_descriptors[0];
- const int axis_idx = get_dimension_idx(output_descriptor, axis);
+ const int axis_idx = get_dimension_idx(output_descriptor.layout, axis);
+ ARM_COMPUTE_ERROR_ON_MSG(axis_idx > 2, "Unsupported concatenation axis!");
// Extract shapes
std::vector<const TensorShape *> shapes;
+ shapes.reserve(input_descriptors.size());
for(auto &input_descriptor : input_descriptors)
{
shapes.emplace_back(&input_descriptor.shape);
}
- // Calculate output shape
- if(axis_idx == 0)
- {
- output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(shapes);
- }
- else if(axis_idx == 2)
- {
- output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(shapes);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported concatenation axis!");
- }
+ output_descriptor.shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(shapes, axis_idx);
return output_descriptor;
}
@@ -122,7 +117,11 @@
ARM_COMPUTE_ERROR_ON(t == nullptr);
inputs_descriptors.push_back(t->desc());
}
- output_info = compute_output_descriptor(inputs_descriptors, _axis);
+ output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis);
+ if(!_concat_descriptor.output_qinfo.empty())
+ {
+ output_info.quant_info = _concat_descriptor.output_qinfo;
+ }
}
return output_info;
@@ -138,4 +137,4 @@
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
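
The per-axis special cases are replaced by the generic calculate_concatenate_shape(), which sums the sizes along the concatenation axis and keeps the remaining dimensions. A quick sketch of the expected result; the shapes are illustrative:

    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    #include <cassert>
    #include <vector>

    int main()
    {
        using namespace arm_compute;

        // Two NCHW-ordered shapes [W, H, C, N]; concatenating along axis 2 (channels)
        TensorShape a(16U, 16U, 3U, 1U);
        TensorShape b(16U, 16U, 5U, 1U);

        std::vector<const TensorShape *> shapes{ &a, &b };
        const TensorShape out = misc::shape_calculator::calculate_concatenate_shape(shapes, 2);

        assert(out[0] == 16 && out[1] == 16 && out[2] == 8 && out[3] == 1);
        return 0;
    }
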
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index 15c7ff6..1c8dcae 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,10 +97,11 @@
std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
return output_descriptor;
}
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index e7ccffd..b1a6db7 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,10 +66,11 @@
info.pad().first, info.pad().second,
info.stride().first, info.stride().second);
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
return output_descriptor;
}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 75ca5f4..cdd9e7b 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,8 +32,9 @@
{
namespace graph
{
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method)
- : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method,
+ QuantizationInfo out_quant_info)
+ : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(out_quant_info), _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -85,10 +86,11 @@
std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
return output_descriptor;
}
@@ -113,7 +115,13 @@
ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
- return compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
+ TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
+ if(!_out_quant_info.empty())
+ {
+ output_info.quant_info = _out_quant_info;
+ }
+
+ return output_info;
}
NodeType DepthwiseConvolutionLayerNode::type() const
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
new file mode 100644
index 0000000..c304a6c
--- /dev/null
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
+ unsigned int num_groups,
+ ConvolutionMethod method,
+ FastMathHint fast_math_hint,
+ QuantizationInfo out_quant_info, ActivationLayerInfo fused_activation)
+ : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info), _fused_activation(fused_activation)
+{
+ _input_edges.resize(7, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+void FusedConvolutionBatchNormalizationNode::set_convolution_method(ConvolutionMethod method)
+{
+ _method = method;
+}
+
+float FusedConvolutionBatchNormalizationNode::epsilon() const
+{
+ return _epsilon;
+}
+
+ConvolutionMethod FusedConvolutionBatchNormalizationNode::convolution_method() const
+{
+ return _method;
+}
+
+void FusedConvolutionBatchNormalizationNode::set_fast_math_hint(FastMathHint hint)
+{
+ _fast_math_hint = hint;
+}
+
+FastMathHint FusedConvolutionBatchNormalizationNode::fast_math_hint() const
+{
+ return _fast_math_hint;
+}
+
+PadStrideInfo FusedConvolutionBatchNormalizationNode::convolution_info() const
+{
+ return _info;
+}
+
+unsigned int FusedConvolutionBatchNormalizationNode::num_groups() const
+{
+ return _num_groups;
+}
+
+ActivationLayerInfo FusedConvolutionBatchNormalizationNode::fused_activation() const
+{
+ return _fused_activation;
+}
+
+void FusedConvolutionBatchNormalizationNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+ _fused_activation = fused_activation;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info)
+{
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+ std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+ const DataLayout data_layout = input_descriptor.layout;
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+
+ return output_descriptor;
+}
+
+bool FusedConvolutionBatchNormalizationNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor FusedConvolutionBatchNormalizationNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ const Tensor *src = input(0);
+ const Tensor *weights = input(1);
+
+ ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+ TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
+ if(!_out_quant_info.empty())
+ {
+ output_info.quant_info = _out_quant_info;
+ }
+
+ return output_info;
+}
+
+NodeType FusedConvolutionBatchNormalizationNode::type() const
+{
+ return FusedConvolutionBatchNormalizationNode::node_type;
+}
+
+void FusedConvolutionBatchNormalizationNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
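For reference, a worked instance of the shape computation in compute_output_descriptor() above (a sketch; the exact values come from scaled_dimensions() and the PadStrideInfo in use, and the sizes below are purely illustrative):

// Assuming an input of W=56, H=56 with a 3x3 kernel, stride 1 and pad 1 (floor rounding):
//   output_width  = (56 + 2*1 - 3) / 1 + 1 = 56
//   output_height = (56 + 2*1 - 3) / 1 + 1 = 56
// The channel dimension of the output is then overwritten with weights_descriptor.shape[3],
// i.e. the number of output feature maps of the fused convolution.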
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
index 26c145a..48b93c9 100644
--- a/src/graph/nodes/PoolingLayerNode.cpp
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,9 +57,10 @@
std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info());
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), pooled_width);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), pooled_height);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), pooled_width);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), pooled_height);
return output_descriptor;
}
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
index 6b83f6b..21ad451 100644
--- a/src/graph/nodes/ReorgLayerNode.cpp
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,10 +53,11 @@
ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width / stride);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height / stride);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
return output_descriptor;
}
diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp
index a6aa7bf..a399229 100644
--- a/src/graph/nodes/ResizeLayerNode.cpp
+++ b/src/graph/nodes/ResizeLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,9 +68,10 @@
const Tensor *src = input(0);
ARM_COMPUTE_ERROR_ON(src == nullptr);
+ const DataLayout data_layout = src->desc().layout;
TensorDescriptor output_desc = src->desc();
- size_t width_idx = get_dimension_idx(output_desc, DataLayoutDimension::WIDTH);
- size_t height_idx = get_dimension_idx(output_desc, DataLayoutDimension::HEIGHT);
+ size_t width_idx = get_dimension_idx(data_layout, DataLayoutDimension::WIDTH);
+ size_t height_idx = get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT);
output_desc.shape.set(width_idx, static_cast<int>(output_desc.shape[width_idx] * _scale_width));
output_desc.shape.set(height_idx, static_cast<int>(output_desc.shape[height_idx] * _scale_height));
diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp
new file mode 100644
index 0000000..d26498a
--- /dev/null
+++ b/src/graph/nodes/StackLayerNode.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/StackLayerNode.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis)
+ : _total_nodes(total_nodes), _axis(axis)
+{
+ _input_edges.resize(_total_nodes, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+int StackLayerNode::axis() const
+{
+ return _axis;
+}
+
+TensorDescriptor StackLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
+
+ TensorDescriptor output_descriptor = input_descriptors[0];
+
+ const TensorInfo input_info(input_descriptors[0].shape, 1, input_descriptors[0].data_type);
+ const unsigned int num_tensors = input_descriptors.size();
+
+ output_descriptor.shape = arm_compute::misc::shape_calculator::compute_stack_shape(input_info, axis, num_tensors);
+
+ return output_descriptor;
+}
+
+bool StackLayerNode::forward_descriptors()
+{
+ if(_outputs[0] != NullTensorID)
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor StackLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ // Check if all input tensors are set
+ bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
+ {
+ return eid != EmptyEdgeID;
+ });
+
+ TensorDescriptor output_info = {};
+
+ if(are_all_inputs_set)
+ {
+ std::vector<TensorDescriptor> inputs_descriptors;
+ for(unsigned int i = 0; i < _input_edges.size(); ++i)
+ {
+ const Tensor *t = _graph->tensor(input_id(i));
+ ARM_COMPUTE_ERROR_ON(t == nullptr);
+ inputs_descriptors.push_back(t->desc());
+ }
+ output_info = compute_output_descriptor(inputs_descriptors, _axis);
+ }
+
+ return output_info;
+}
+
+NodeType StackLayerNode::type() const
+{
+ return NodeType::StackLayer;
+}
+
+void StackLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/UpsampleLayerNode.cpp b/src/graph/nodes/UpsampleLayerNode.cpp
index bdd39e8..88af122 100644
--- a/src/graph/nodes/UpsampleLayerNode.cpp
+++ b/src/graph/nodes/UpsampleLayerNode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,9 +54,10 @@
const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width * info.x());
- output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height * info.y());
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width * info.x());
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height * info.y());
return output_descriptor;
}
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index ef156ea..c939de1 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,14 @@
_info = ss.str();
}
+void DotGraphVisitor::visit(FusedConvolutionBatchNormalizationNode &n)
+{
+ ARM_COMPUTE_UNUSED(n);
+ std::stringstream ss;
+ ss << "FusedConvolutionBatchNormalizationNode";
+ _info = ss.str();
+}
+
void DotGraphVisitor::visit(NormalizationLayerNode &n)
{
std::stringstream ss;
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index c5d42b1..1323bb3 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -66,7 +66,7 @@
std::vector<BlobInfo> group_sizes;
std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
{
- return BlobInfo(b.max_size, b.max_alignment);
+ return BlobInfo{ b.max_size, b.max_alignment };
});
// Update blob sizes
@@ -75,7 +75,7 @@
group_sizes.resize(max_size);
std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
{
- return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
+ return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment) };
});
// Calculate group mappings
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 533e6fa..8bc7b8e 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -47,7 +47,7 @@
* @return A pointer to the context properties which can be used to create an opencl context
*/
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
{
ARM_COMPUTE_UNUSED(device);
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
@@ -55,7 +55,7 @@
if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
{
// Create a cl_context with a printf_callback and user specified buffer size.
- cl_context_properties properties_printf[] =
+ std::array<cl_context_properties, 7> properties_printf =
{
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
// Enable a printf callback function for this context.
@@ -65,17 +65,17 @@
CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
0
};
- std::copy_n(properties_printf, 7, prop);
+ prop = properties_printf;
}
else
#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
{
- cl_context_properties properties[] =
+ std::array<cl_context_properties, 3> properties =
{
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
0
};
- std::copy_n(properties, 3, prop);
+ std::copy(properties.begin(), properties.end(), prop.begin());
};
}
} //namespace
@@ -94,11 +94,11 @@
std::vector<cl::Device> platform_devices;
p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
- cl_int err = CL_SUCCESS;
- cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
initialise_context_properties(p, device, properties);
- cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+ cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
return std::make_tuple(cl_context, device, err);
}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 5bea85c..557378b 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,8 +33,8 @@
{
}
-CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
- : _region(nullptr), _region_owned(std::move(memory))
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
+ : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index 88d45ac..2577ec0 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,9 @@
using namespace arm_compute;
CLMultiHOG::CLMultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
+ : _num_models(num_models), _model()
{
+ _model.resize(_num_models);
}
size_t CLMultiHOG::num_models() const
@@ -42,11 +43,11 @@
ICLHOG *CLMultiHOG::cl_model(size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= _num_models);
- return (_model.get() + index);
+ return (&_model[index]);
}
const ICLHOG *CLMultiHOG::cl_model(size_t index) const
{
ARM_COMPUTE_ERROR_ON(index >= _num_models);
- return (_model.get() + index);
+ return (&_model[index]);
}
\ No newline at end of file
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 865f389..6d5dba0 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
using namespace arm_compute;
CLPyramid::CLPyramid()
- : _info(), _pyramid(nullptr)
+ : _info(), _pyramid()
{
}
@@ -51,8 +51,8 @@
void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
- _info = info;
- _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+ _info = info;
+ _pyramid.resize(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
@@ -109,11 +109,9 @@
void CLPyramid::allocate()
{
- ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
for(size_t i = 0; i < _info.num_levels(); ++i)
{
- (_pyramid.get() + i)->allocator()->allocate();
+ _pyramid[i].allocator()->allocate();
}
}
@@ -126,5 +124,5 @@
{
ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
- return (_pyramid.get() + index);
+ return &_pyramid[index];
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 0307498..101e4f1 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@
namespace
{
-std::unique_ptr<ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, size_t size, cl_uint alignment)
{
// Try fine-grain SVM
std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
@@ -101,10 +101,10 @@
info().set_is_resizable(true);
}
-arm_compute::Status CLTensorAllocator::import_memory(cl::Buffer buffer)
+Status CLTensorAllocator::import_memory(cl::Buffer buffer)
{
ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() < info().total_size());
ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index a262d6b..2c3f9ce 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Error.h"
@@ -31,42 +32,13 @@
#include <fstream>
#include <iostream>
#include <limits>
+#include <memory>
#include <string>
namespace arm_compute
{
-namespace
-{
-/** Utility function used to initialize the LWS values to test.
- * Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws Vector of LWS to test for a specific dimension
- * @param[in] gws Size of the GWS
- * @param[in] lws_max Max LKWS value allowed to be tested
- * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
-void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
-{
- lws.push_back(1);
-
- for(unsigned int i = 2; i <= lws_max; ++i)
- {
- // Power of two condition
- const bool is_power_of_two = (i & (i - 1)) == 0;
-
- // Condition for the module accordingly with the mod_let_one flag
- const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
-
- if(mod_cond || is_power_of_two)
- {
- lws.push_back(i);
- }
- }
-}
-} // namespace
-
CLTuner::CLTuner(bool tune_new_kernels)
- : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+ : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::NORMAL)
{
}
@@ -88,6 +60,15 @@
return _tune_new_kernels;
}
+void CLTuner::set_tuner_mode(CLTunerMode mode)
+{
+ _tuner_mode = mode;
+}
+CLTunerMode CLTuner::get_tuner_mode() const
+{
+ return _tuner_mode;
+}
+
void CLTuner::tune_kernel_static(ICLKernel &kernel)
{
ARM_COMPUTE_UNUSED(kernel);
@@ -182,61 +163,53 @@
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
- cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+ cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
- cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
+ // Run the kernel with default lws to be used as baseline
+ kernel.run(kernel.window(), queue_profiler);
+
+ queue_profiler.finish();
+
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ cl_ulong min_exec_time = end - start;
+ _kernel_event = nullptr;
+
cl::NDRange opt_lws = cl::NullRange;
- const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
- const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
- const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
-
- std::vector<unsigned int> lws_x;
- std::vector<unsigned int> lws_y;
- std::vector<unsigned int> lws_z;
-
- // Initialize the LWS values to test
- initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
- initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
- initialize_lws_values(lws_z, gws[2], lws_z_max, false);
-
- for(const auto &z : lws_z)
+ // Construct the list of LWS values to be tested based on the tuner mode.
+ auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
+ for(size_t i = 0; i < lws_list->size(); ++i)
{
- for(const auto &y : lws_y)
+ cl::NDRange lws_test = (*lws_list)[i];
+ auto x = lws_test[0];
+ auto y = lws_test[1];
+ auto z = lws_test[2];
+ const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+ if(invalid_lws)
{
- for(const auto &x : lws_x)
- {
- cl::NDRange lws_test = cl::NDRange(x, y, z);
+ continue;
+ }
- bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+ // Set the Local-Workgroup-Size
+ kernel.set_lws_hint(lws_test);
- invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
+ // Run the kernel
+ kernel.run(kernel.window(), queue_profiler);
- if(invalid_lws)
- {
- continue;
- }
+ queue_profiler.finish();
- //Set the Local-Workgroup-Size
- kernel.set_lws_hint(lws_test);
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ const cl_ulong diff = end - start;
+ _kernel_event = nullptr;
- // Run the kernel
- kernel.run(kernel.window(), queue_profiler);
-
- queue_profiler.finish();
-
- const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
- const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
- const cl_ulong diff = end - start;
- _kernel_event = nullptr;
-
- // Check the execution time
- if(diff < min_exec_time)
- {
- min_exec_time = diff;
- opt_lws = cl::NDRange(x, y, z);
- }
- }
+ // Check the execution time
+ if(diff < min_exec_time)
+ {
+ min_exec_time = diff;
+ opt_lws = cl::NDRange(x, y, z);
}
}
@@ -301,7 +274,7 @@
std::ofstream fs;
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
fs.open(filename, std::ios::out);
- for(auto kernel_data : _lws_table)
+ for(auto const &kernel_data : _lws_table)
{
fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl;
}
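The hand-rolled LWS enumeration above is replaced by a CLLWSList obtained from CLLWSListFactory for the selected CLTunerMode, with the kernel's default LWS timed first as the baseline. A minimal usage sketch, assuming the standard CLScheduler setup (the tuner mode and file name below are illustrative):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

void init_with_tuner()
{
    arm_compute::CLTuner tuner(true);                        // Tune kernels that are not yet in the LWS table
    tuner.set_tuner_mode(arm_compute::CLTunerMode::NORMAL);  // NORMAL, EXHAUSTIVE or RAPID select the CLLWSList to sweep
    arm_compute::CLScheduler::get().default_init(&tuner);    // Attach the tuner to the scheduler
    // ... configure and run CL functions; tuned LWS values can then be stored with tuner.save_to_file("acl_tuner.csv")
}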
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 84e8709..4c7458d 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -177,7 +177,7 @@
void CLCannyEdge::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run sobel
_sobel->run();
@@ -199,6 +199,4 @@
_l1_list_counter.clear(CLScheduler::get().queue());
_l1_stack.clear(CLScheduler::get().queue());
CLScheduler::get().enqueue(_edge_trace, true);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 018c674..b8224d2 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,9 @@
*/
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
@@ -35,56 +38,168 @@
namespace arm_compute
{
CLConcatenateLayer::CLConcatenateLayer()
- : _concat_function(nullptr)
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimX)
{
}
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, DataLayoutDimension axis)
+void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
+ _axis = axis;
+ _num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+ std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
+ std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
{
- case 0:
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+ unsigned int offset = 0;
+ switch(_axis)
+ {
+ case Window::DimX:
{
- auto func = support::cpp14::make_unique<CLWidthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
+ switch(_num_inputs)
+ {
+ case 2:
+ {
+ // Configure WidthConcatenate2Tensors kernel
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
+ kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 4:
+ {
+ // Configure WidthConcatenate4Tensors kernel
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
+ kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ {
+ // Configure generic case WidthConcatenate kernels
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ }
break;
}
- case 2:
+ case Window::DimY:
{
- auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
break;
}
default:
- ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_ERROR("Axis not supported");
}
}
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+ const unsigned int num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->data_layout(), axis))
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+ unsigned int offset = 0;
+ switch(axis)
{
- case 0:
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector, output));
+ case Window::DimX:
+ {
+ switch(num_inputs)
+ {
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output));
+ break;
+ default:
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
+ break;
+ }
break;
- case 2:
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayer::validate(inputs_vector, output));
+ }
+ case Window::DimY:
+ {
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
break;
+ }
+ case Window::DimZ:
+ {
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
+ break;
+ }
default:
- ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_ERROR("Axis not supported");
}
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
return Status{};
}
void CLConcatenateLayer::run()
{
- ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
- _concat_function->run();
+ for(auto &kernel : _concat_kernels)
+ {
+ CLScheduler::get().enqueue(*kernel, true);
+ }
}
} // namespace arm_compute
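A small configuration sketch for the reworked interface, which now takes a plain size_t axis (0 = width, 1 = height, 2 = depth) instead of a DataLayoutDimension; the shapes and data type below are illustrative:

arm_compute::CLTensor in0, in1, dst;
in0.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(16U, 8U), 1, arm_compute::DataType::F32));
in1.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(24U, 8U), 1, arm_compute::DataType::F32));

arm_compute::CLConcatenateLayer concat;
concat.configure({ &in0, &in1 }, &dst, 0);  // Axis 0 (width): dst is auto-initialized to shape (40, 8)

in0.allocator()->allocate();
in1.allocator()->allocate();
dst.allocator()->allocate();
concat.run();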
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 0131801..f09585e 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,13 +58,13 @@
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(conv == nullptr);
- int16_t conv_col[matrix_size];
- int16_t conv_row[matrix_size];
- _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+ std::array<int16_t, matrix_size> conv_col{ 0 };
+ std::array<int16_t, matrix_size> conv_row{ 0 };
+ _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
if(_is_separable)
{
- std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+ std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
// Manage intermediate buffers
@@ -75,8 +75,8 @@
scale = calculate_matrix_scale(conv, matrix_size);
}
- _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
- _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+ _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
_border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffer
@@ -96,12 +96,10 @@
if(_is_separable)
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
-
- _memory_group.release();
}
else
{
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 0014e71..165d523 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,6 +75,13 @@
_function = std::move(f);
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -111,6 +118,12 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ // Validate FFT-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -169,12 +182,20 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+ if(dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
else
{
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
}
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
new file mode 100644
index 0000000..b22809e
--- /dev/null
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+{
+ batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
+
+ // _crop_box_ind is used to index crop_boxes and retrieve the appropriate crop box.
+ // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+ const float x0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(1, crop_box_ind)));
+ const float y0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(0, crop_box_ind)));
+ const float x1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(3, crop_box_ind)));
+ const float y1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(2, crop_box_ind)));
+ // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
+ start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+ output->info()->set_tensor_shape(out_shape);
+}
+
+inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
+{
+ bool is_width_flipped = end[0] < start[0];
+ bool is_height_flipped = end[1] < start[1];
+ /** The number of rows out of bounds at the start and end of output. */
+ std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+ /** The number of columns out of bounds at the start and end of output. */
+ std::array<int32_t, 2> cols_out_of_bounds{ 0 };
+ if(is_height_flipped)
+ {
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+ rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+ }
+ else
+ {
+ rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+ }
+ if(is_width_flipped)
+ {
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+ cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+ }
+ else
+ {
+ cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+ }
+
+ Window full_window = calculate_max_window(*output->info());
+
+ // Full output window:
+ // --------------------------------
+ // | Out of bounds |
+ // | rows before |
+ // |------------------------------|
+ // | Out of | In | Out of |
+ // | bounds | bounds | bounds |
+ // | cols | elements | cols |
+ // | before | copied | after |
+ // | | from input | |
+ // |------------------------------|
+ // | Out of bounds |
+ // | rows after |
+ // |------------------------------|
+ // Use a separate output window for each section of the full output window.
+ // Fill all output rows that have no elements that are within the input bounds
+ // with the extrapolation value using memset.
+ // First for the rows before the in bounds rows.
+ if(rows_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_rows_before(full_window);
+ slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ Window slice_in(full_window);
+ slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if(rows_in_bounds > 0)
+ {
+ // Fill all elements that share a row with an in bounds element with the extrapolation value.
+ if(cols_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_cols_before(slice_in);
+ slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ if(cols_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_cols_after(slice_in);
+ slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ // Copy all elements within the input bounds from the input tensor.
+ int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if(cols_in_bounds > 0)
+ {
+ Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
+ Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
+
+ kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ CLScheduler::get().enqueue(*kernel);
+ }
+ }
+
+ // Fill all rows after the in bounds elements with the extrapolation value.
+ if(rows_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_rows_after(full_window);
+ slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
+ CLScheduler::get().enqueue(*kernel);
+ }
+}
+} // namespace
+
+CLCropResize::CLCropResize()
+ : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+{
+}
+
+Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
+ Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+ TensorInfo temp_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+ }
+ return Status{};
+}
+
+void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+ InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+ _num_boxes = boxes->info()->tensor_shape()[1];
+ TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+ _input = input;
+ _boxes = boxes;
+ _box_ind = box_ind;
+ _output = output;
+ _method = method;
+ _extrapolation_value = extrapolation_value;
+
+ // For each crop box:
+ // - The initial cropped image is produced as specified by boxes[i] from the 3D image input[box_ind[i]].
+ // Possibly using a CLCropKernel and up to four CLMemsetKernels.
+ // - A tensor is required to hold this initial cropped image.
+ // - A scale function is used to resize the cropped image to the size specified by crop_size.
+ // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+ // that will hold all final cropped and scaled 3D images using CLCopyKernel.
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ auto crop_tensor = support::cpp14::make_unique<CLTensor>();
+ TensorInfo crop_result_info(1, DataType::F32);
+ crop_result_info.set_data_layout(DataLayout::NHWC);
+ crop_tensor->allocator()->init(crop_result_info);
+ _crop_results.emplace_back(std::move(crop_tensor));
+
+ auto scale_tensor = support::cpp14::make_unique<CLTensor>();
+ TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+ scaled_result_info.set_data_layout(DataLayout::NHWC);
+ scale_tensor->allocator()->init(scaled_result_info);
+ _scaled_results.emplace_back(std::move(scale_tensor));
+ }
+}
+
+void CLCropResize::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+ // The contents of _boxes and _box_ind are required to calculate the shape
+ // of the initial cropped image and thus are required to configure the
+ // kernels used for cropping and scaling.
+ _boxes->map(CLScheduler::get().queue());
+ _box_ind->map(CLScheduler::get().queue());
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+ // may not be known until run-time and so the kernels cannot be configured until then.
+ uint32_t batch_index;
+ Coordinates start{};
+ Coordinates end{};
+ configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index);
+
+ auto scale_kernel = support::cpp14::make_unique<CLScale>();
+ scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+ _scale.emplace_back(std::move(scale_kernel));
+
+ Window win = calculate_max_window(*_output->info());
+ win.set(3, Window::Dimension(i, i + 1, 1));
+
+ auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
+ copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win);
+ _copy.emplace_back(std::move(copy_kernel));
+
+ _crop_results[i]->allocator()->allocate();
+ _scaled_results[i]->allocator()->allocate();
+
+ run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value);
+ }
+ _boxes->unmap(CLScheduler::get().queue());
+ _box_ind->unmap(CLScheduler::get().queue());
+ CLScheduler::get().sync();
+ for(auto &kernel : _scale)
+ {
+ kernel->run();
+ }
+ CLScheduler::get().sync();
+ for(auto &kernel : _copy)
+ {
+ CLScheduler::get().enqueue(*kernel, true);
+ }
+ CLScheduler::get().sync();
+}
+} // namespace arm_compute
\ No newline at end of file
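A minimal call sketch for the new function (tensor initialization, allocation and data upload are omitted; the crop size and interpolation policy are illustrative). As run() above shows, the crop boxes and batch indices are mapped and read on the host, so the per-box kernels can only be configured at run time:

arm_compute::CLTensor input, boxes, box_ind, output;
// input: NHWC F32, shape (C, W, H, N); boxes: (4, num_boxes) holding normalized [y0, x0, y1, x1];
// box_ind: (num_boxes); output: F32, shape (C, crop_w, crop_h, num_boxes)
arm_compute::CLCropResize crop_resize;
crop_resize.configure(&input, &boxes, &box_ind, &output,
                      arm_compute::Coordinates2D{ 14, 14 },        // crop_size (x, y)
                      arm_compute::InterpolationPolicy::BILINEAR,  // AREA is rejected by validate()
                      0.f);                                        // extrapolation_value
crop_resize.run();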
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9da02c1..c6f79d3 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -23,188 +23,117 @@
*/
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include <cmath>
#include <memory>
#include <tuple>
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _scale_f(),
- _conv_f(),
- _flip_weights(),
- _scaled_output(),
- _original_weights(nullptr),
- _weights_flipped(),
- _is_prepared(false)
+CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _function()
{
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+ switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
+ f->configure(input, weights, bias, output, deconv_info, weights_info);
+ _function = std::move(f);
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ f->configure(input, weights, bias, output, deconv_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+ switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ // Validate direct convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ // Validate gemm-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_UNUSED(output, bias, weights_info);
const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
-
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
-
- const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
- if(bias != nullptr)
+ if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ return DeconvolutionMethod::DIRECT;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
-
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
-
- return Status{};
+ return DeconvolutionMethod::GEMM;
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- const DataLayout data_layout = input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- _original_weights = weights;
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
-
- const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
-
- _is_prepared = weights_info.retain_internal_weights();
-
- _memory_group.manage(&_scaled_output);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- // configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
- _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
- _scaled_output.allocator()->allocate();
-}
-
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
const WeightsInfo &weights_info)
{
- configure(input, weights, bias, output, info, 0, 0, weights_info);
+ configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
const WeightsInfo &weights_info)
{
- return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+ return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
}
void CLDeconvolutionLayer::run()
{
prepare();
-
- _memory_group.acquire();
-
- _scale_f.run();
- _conv_f.run();
-
- _memory_group.release();
+ _function->run();
}
void CLDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights flipping and mark original weights tensor as unused
- _weights_flipped.allocator()->allocate();
- _weights_flipped.map(true);
- _original_weights->map(CLScheduler::get().queue(), true);
- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
- _weights_flipped.unmap();
- _original_weights->unmap(CLScheduler::get().queue());
- _original_weights->mark_as_unused();
-
- // Prepare convolution
- _conv_f.prepare();
-
- if(!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
- _is_prepared = true;
- }
+ _function->prepare();
}
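
After this refactor CLDeconvolutionLayer is a thin dispatcher: get_deconvolution_method() picks the GEMM-based path only when the kernel size equals the stride in both spatial dimensions and otherwise falls back to the direct upsample-and-convolve path, presumably because output patches do not overlap in the kernel-equals-stride case. A distilled standalone sketch of that selection rule:

#include <cstdio>

enum class DeconvMethod { DIRECT, GEMM };

// Mirrors CLDeconvolutionLayer::get_deconvolution_method(): GEMM is chosen only
// when kernel width/height match the stride in each dimension.
DeconvMethod select_deconv_method(int kernel_w, int kernel_h, int stride_x, int stride_y)
{
    if(kernel_w != stride_x || kernel_h != stride_y)
    {
        return DeconvMethod::DIRECT;
    }
    return DeconvMethod::GEMM;
}

int main()
{
    std::printf("3x3 kernel, stride 2: %s\n", select_deconv_method(3, 3, 2, 2) == DeconvMethod::DIRECT ? "DIRECT" : "GEMM");
    std::printf("2x2 kernel, stride 2: %s\n", select_deconv_method(2, 2, 2, 2) == DeconvMethod::DIRECT ? "DIRECT" : "GEMM");
}
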
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d..c66dff0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
: _upsample(),
+ _memset(),
_output(nullptr)
{
}
@@ -51,22 +48,13 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
+ _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample.configure(input, _output, inner_border, info);
}
void CLDeconvolutionLayerUpsample::run()
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
-
- CLScheduler::get().enqueue(_upsample, false);
+ CLScheduler::get().enqueue(_memset, false);
+ CLScheduler::get().enqueue(_upsample, true);
}
+} // namespace arm_compute
\ No newline at end of file
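
CLDeconvolutionLayerUpsample no longer maps the output and zero-fills it on the host; a memset kernel is enqueued before the upsample kernel so both stages stay on the device queue. Conceptually the pair produces a zero tensor with the input scattered at stride intervals; a standalone 1-D sketch (the real kernels also handle padding and data layout, which this omits):

#include <cstdio>
#include <vector>

// Zero-insertion upsampling as performed by memset + upsample: write each input
// value at a stride-spaced position of a zero-initialised output.
std::vector<float> upsample_row(const std::vector<float> &in, int stride)
{
    std::vector<float> out(in.size() * stride, 0.0f); // memset stage
    for(size_t i = 0; i < in.size(); ++i)             // upsample stage
    {
        out[i * stride] = in[i];
    }
    return out;
}

int main()
{
    for(float v : upsample_row({ 1.f, 2.f, 3.f }, 2))
    {
        std::printf("%.0f ", v); // 1 0 2 0 3 0
    }
    std::printf("\n");
}
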
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index e46647a..f687e54 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -36,8 +36,7 @@
using namespace arm_compute;
CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
- : _inputs_vector(),
- _concat_kernels_vector(),
+ : _concat_kernels_vector(),
_border_handlers_vector(),
_num_inputs(0)
{
@@ -53,10 +52,10 @@
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
+ _border_handlers_vector.resize(_num_inputs);
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -82,7 +81,7 @@
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int depth_offset = 0;
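
The depth-specific shape helper is replaced by the generic calculate_concatenate_shape(), parameterised on the concatenation axis (Window::DimZ here). A standalone sketch of that computation, assuming all inputs agree on every dimension except the concatenation axis:

#include <cassert>
#include <cstdio>
#include <vector>

using Shape = std::vector<size_t>;

// The output shape matches the inputs everywhere except on 'axis', where the
// extents are summed.
Shape concatenate_shape(const std::vector<Shape> &inputs, size_t axis)
{
    Shape out = inputs.front();
    out[axis] = 0;
    for(const Shape &s : inputs)
    {
        assert(s.size() == out.size());
        out[axis] += s[axis];
    }
    return out;
}

int main()
{
    const Shape out = concatenate_shape({ { 16, 16, 3 }, { 16, 16, 5 } }, 2); // axis 2 == DimZ
    std::printf("%zux%zux%zu\n", out[0], out[1], out[2]);                     // 16x16x8
}
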
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 15cbfce..97b0a01 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -45,10 +45,18 @@
}
void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- ActivationLayerInfo act_info)
+ ActivationLayerInfo act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
@@ -62,11 +70,13 @@
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
DepthwiseConvolutionReshapeInfo info;
info.c0 = 4;
- info.transpose = is_stride_1 && is_dot8_supported;
+ info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
if(_needs_permute)
{
@@ -103,7 +113,7 @@
// Configure kernel
_kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
// Permute output if needed
if(_needs_permute)
@@ -126,26 +136,26 @@
}
Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info, GPUTarget gpu_target)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const bool needs_permute = is_nhwc && (depth_multiplier > 1);
- const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
DepthwiseConvolutionReshapeInfo info;
info.c0 = 4;
- info.transpose = is_stride_1 && is_dot8_supported;
+ info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
if(needs_permute)
{
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -155,7 +165,8 @@
const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
+ dilation));
}
else if(is_nhwc)
{
@@ -163,13 +174,13 @@
{
auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
- act_info));
+ act_info, dilation));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
}
return Status{};
@@ -179,7 +190,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_permute)
{
@@ -192,8 +203,6 @@
{
_permute_output_to_nhwc.run();
}
-
- _memory_group.release();
}
void CLDepthwiseConvolutionLayer3x3::prepare()
@@ -229,7 +238,7 @@
}
void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -238,12 +247,15 @@
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
if(bool(can_run_optimised_3x3_kernel))
{
auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
- f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
_optimised_function = std::move(f);
}
else
@@ -262,7 +274,7 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -283,7 +295,7 @@
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
CLScheduler::get().tune_kernel_static(_im2col_kernel);
// Weights reshape configuration
@@ -310,7 +322,8 @@
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
@@ -345,11 +358,14 @@
}
Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
if(can_run_optimised_3x3_kernel)
@@ -361,7 +377,7 @@
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
const size_t weights_w = weights->dimension(idx_w);
const size_t weights_h = weights->dimension(idx_h);
const size_t weights_z = weights->dimension(idx_c);
@@ -375,7 +391,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
const TensorShape shape_weights_reshape(patch_size, weights_z);
TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
@@ -405,7 +421,7 @@
}
else
{
- CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
}
return Status{};
}
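
The new dilation checks compare the dilated kernel extent, k + (k - 1) * (d - 1), against the padded input extent in each spatial dimension. A small standalone sketch mirroring that condition:

#include <cstdio>

// Extent covered by a kernel of size k with dilation d (d == 1 is the dense case).
int dilated_kernel_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}

// Mirrors the ARM_COMPUTE_ERROR_ON conditions added above, for one dimension.
bool dilation_fits(int k, int d, int input_dim, int pad_before, int pad_after)
{
    return dilated_kernel_extent(k, d) <= input_dim + pad_before + pad_after;
}

int main()
{
    std::printf("3x3 kernel, dilation 2 -> extent %d\n", dilated_kernel_extent(3, 2)); // 5
    std::printf("fits in 4 with pad 1+1: %s\n", dilation_fits(3, 2, 4, 1, 1) ? "yes" : "no");
}
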
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 6f33b2e..cdfdfc7 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,36 +21,22 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-CLDequantizationLayer::CLDequantizationLayer()
- : _dequantize_kernel()
+namespace arm_compute
{
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
-Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
-
- return Status{};
+ return CLDequantizationLayerKernel::validate(input, output);
}
-
-void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
- _dequantize_kernel.configure(input, output, min_max);
-}
-
-void CLDequantizationLayer::run()
-{
- // Run dequantization kernel
- CLScheduler::get().enqueue(_dequantize_kernel, false);
-}
+} // namespace arm_compute
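
CLDequantizationLayer is reduced to a thin wrapper that configures CLDequantizationLayerKernel and hands it to the base function, dropping the min/max tensor of the old interface. For asymmetric 8-bit inputs the per-element computation amounts to real = scale * (q - offset); a standalone sketch of that formula (an assumption based on the library's QASYMM8 scheme, not code from this patch):

#include <cstdint>
#include <cstdio>
#include <vector>

// Per-element dequantization of QASYMM8 values using a (scale, offset) pair.
std::vector<float> dequantize(const std::vector<uint8_t> &q, float scale, int offset)
{
    std::vector<float> out;
    out.reserve(q.size());
    for(uint8_t v : q)
    {
        out.push_back(scale * (static_cast<int>(v) - offset));
    }
    return out;
}

int main()
{
    for(float f : dequantize({ 0, 128, 255 }, 0.5f, 128))
    {
        std::printf("%.1f ", f); // -64.0 0.0 63.5
    }
    std::printf("\n");
}
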
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
new file mode 100644
index 0000000..6e14e26
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _flip_axis(),
+ _is_prepared(false)
+{
+}
+
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if(bias != nullptr)
+ {
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
+
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+    // Configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+
+ // Setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+ _scaled_output.allocator()->allocate();
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ if(weights->info()->data_layout() == DataLayout::NHWC)
+ {
+ axis_data[0] = 1;
+ axis_data[1] = 2;
+ }
+ else
+ {
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ }
+ _flip_axis.unmap();
+}
+
+void CLDirectDeconvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _scale_f.run();
+ _conv_f.run();
+}
+
+void CLDirectDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ // Free flipped weights
+ if(!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
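
CLDirectDeconvolutionLayer keeps the original strategy: zero-insertion upsample, weight flip along the spatial axes (1 and 2 for NHWC, 0 and 1 for NCHW, as set up above), then a stride-1 convolution. For the symmetric padding this function requires, the output spatial extent should follow the usual transposed-convolution relation; a standalone sketch (deconvolution_output_dimensions() itself is not shown in this hunk, so treat the formula as an assumption):

#include <cstdio>

// Usual transposed-convolution output size for symmetric padding.
int deconv_output_dim(int in, int kernel, int stride, int pad)
{
    return stride * (in - 1) + kernel - 2 * pad;
}

int main()
{
    // Upsampling a 7-wide map by stride 2 with a 3-wide kernel and pad 1 -> 13.
    std::printf("%d\n", deconv_output_dim(7, 3, 2, 1));
}
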
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
new file mode 100644
index 0000000..49b5a2a
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+{
+}
+
+void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+
+ // Decompose size to radix factors
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->info()->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+ // Flags
+ _run_scale = config.direction == FFTDirection::Inverse;
+ const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+ // Configure digit reverse
+ FFTDigitReverseKernelInfo digit_reverse_config;
+ digit_reverse_config.axis = config.axis;
+ digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+ TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+ _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+ _memory_group.manage(&_digit_reversed_input);
+ _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+ // Create and configure FFT kernels
+ unsigned int Nx = 1;
+ _num_ffts = decomposed_vector.size();
+ _fft_kernels.resize(_num_ffts);
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+ FFTRadixStageKernelInfo fft_kernel_info;
+ fft_kernel_info.axis = config.axis;
+ fft_kernel_info.radix = radix_for_stage;
+ fft_kernel_info.Nx = Nx;
+ fft_kernel_info.is_first_stage = (i == 0);
+ _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+ Nx *= radix_for_stage;
+ }
+
+ // Configure scale kernel
+ if(_run_scale)
+ {
+ FFTScaleKernelInfo scale_config;
+ scale_config.scale = static_cast<float>(N);
+ scale_config.conjugate = config.direction == FFTDirection::Inverse;
+ is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+ }
+
+ // Allocate tensors
+ _digit_reversed_input.allocator()->allocate();
+ _digit_reverse_indices.allocator()->allocate();
+
+ // Init digit reverse indices
+ const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+ _digit_reverse_indices.map(CLScheduler::get().queue(), true);
+ std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+ _digit_reverse_indices.unmap(CLScheduler::get().queue());
+}
+
+Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+ // Check if FFT is decomposable
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void CLFFT1D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Run digit reverse
+ CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+
+ // Run radix kernels
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+ }
+
+ // Run output scaling
+ if(_run_scale)
+ {
+ CLScheduler::get().enqueue(_scale_kernel, true);
+ }
+}
+} // namespace arm_compute
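
CLFFT1D only accepts lengths that decompose entirely into the radix sizes supported by CLFFTRadixStageKernel (an empty decomposition is rejected), and it chains one radix stage kernel per factor over the digit-reversed input. A greedy standalone sketch of such a decomposition; the library's helpers::fft::decompose_stages() may use a different strategy, and the radix set below is only an example:

#include <cstdio>
#include <set>
#include <vector>

// Greedily factor N into supported radix stages; an empty result means the
// length is not decomposable with the given radix set.
std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &radix)
{
    std::vector<unsigned int> stages;
    for(auto r = radix.rbegin(); r != radix.rend();)
    {
        if(N % *r == 0)
        {
            stages.push_back(*r);
            N /= *r;
        }
        else
        {
            ++r;
        }
    }
    return (N == 1) ? stages : std::vector<unsigned int>{};
}

int main()
{
    for(unsigned int s : decompose(120, { 2, 3, 4, 5, 7, 8 }))
    {
        std::printf("%u ", s); // 8 5 3
    }
    std::printf("\n");
}
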
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
new file mode 100644
index 0000000..165e784
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+
+ // Setup first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ _memory_group.manage(&_first_pass_tensor);
+ _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+ // Setup second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+ _first_pass_tensor.allocator()->allocate();
+}
+
+Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ // Create intermediate tensor info
+ TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+ // Validate first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+ // Validate second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void CLFFT2D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _first_pass_func.run();
+ _second_pass_func.run();
+}
+} // namespace arm_compute
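
CLFFT2D is a straightforward row-column decomposition: one CLFFT1D pass along config.axes.first into a managed intermediate tensor, then a second pass along config.axes.second into the output. A standalone sketch of that two-pass structure with a placeholder 1-D transform:

#include <cstdio>
#include <functional>
#include <vector>

using Row   = std::vector<float>;
using Image = std::vector<Row>;

// Apply a 1-D transform along rows, store the intermediate (the role of
// _first_pass_tensor above), then apply the same transform along columns.
Image transform_2d(const Image &in, const std::function<Row(const Row &)> &fft1d)
{
    Image first_pass;
    for(const Row &row : in) // first pass, axis 0
    {
        first_pass.push_back(fft1d(row));
    }
    Image out(first_pass.size(), Row(first_pass[0].size()));
    for(size_t x = 0; x < first_pass[0].size(); ++x) // second pass, axis 1
    {
        Row col(first_pass.size());
        for(size_t y = 0; y < first_pass.size(); ++y) col[y] = first_pass[y][x];
        const Row t = fft1d(col);
        for(size_t y = 0; y < first_pass.size(); ++y) out[y][x] = t[y];
    }
    return out;
}

int main()
{
    const Image res = transform_2d({ { 1, 2 }, { 3, 4 } }, [](const Row &r) { return r; }); // identity 1-D pass
    std::printf("%.0f %.0f / %.0f %.0f\n", res[0][0], res[0][1], res[1][0], res[1][1]);
}
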
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..afb1cab
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+
+ int pad = 0;
+ bool is_decomposed = false;
+ while(!is_decomposed)
+ {
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+ is_decomposed = !decomposed_vector.empty();
+ if(!is_decomposed)
+ {
+ ++pad;
+ }
+ }
+ return pad;
+}
+} // namespace
+CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager),
+ _flip_weights_func(),
+ _permute_input_func(),
+ _permute_output_func(),
+ _permute_weights_func(),
+ _permute_bias_func(),
+ _pad_input_func(),
+ _pad_weights_func(),
+ _transform_input_func(memory_manager),
+ _transform_weights_func(),
+ _itransform_output_func(memory_manager),
+ _prod_func(),
+ _reduce_func(),
+ _extract_output_func(),
+ _bias_add_func(),
+ _activation_layer_func(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_bias(),
+ _permuted_output(),
+ _padded_input(),
+ _padded_weights(),
+ _flip_axis(),
+ _flipped_weights(),
+ _transformed_input(),
+ _transformed_weights(),
+ _input_weights_product(),
+ _output_product(),
+ _output_reduced(),
+ _itransformed_output(),
+ _reshaped_output(),
+ _bias_output(),
+ _original_weights(nullptr),
+ _original_bias(nullptr),
+ _is_activationlayer_enabled(false),
+ _needs_permute(false),
+ _has_bias(false),
+ _is_prepared(false)
+{
+}
+
+void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ _original_weights = weights;
+ _original_bias = biases;
+
+    // Flag if bias addition is required
+ _has_bias = biases != nullptr;
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ // Tensors to use
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
+
+ // Permute bias
+ if(biases != nullptr)
+ {
+ _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+ _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+ }
+
+ // Permute input if needed
+ _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+ if(_needs_permute)
+ {
+ _memory_group.manage(&_permuted_input);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+ // Pad weights
+ const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+ // Transform weights
+ _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+ _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+ // Pad input
+ const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ _memory_group.manage(&_padded_input);
+ _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+ if(_needs_permute)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ // Transform input
+ _memory_group.manage(&_transformed_input);
+ _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+ _padded_input.allocator()->allocate();
+
+ // Perform product
+ _memory_group.manage(&_output_product);
+ _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+ _transformed_input.allocator()->allocate();
+
+ // Perform reduction
+ _memory_group.manage(&_output_reduced);
+ _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+ _output_product.allocator()->allocate();
+
+ // Transform output
+ _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+ _output_reduced.allocator()->allocate();
+
+ // Reshape output
+ TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+ reshaped_shape.remove_dimension(2);
+ _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+ // Extract correct region
+ const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+ const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
+ const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if(_has_bias)
+ {
+ _memory_group.manage(&_bias_output);
+ }
+ else if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+ _itransformed_output.allocator()->allocate();
+
+ // Add bias
+ if(biases != nullptr)
+ {
+ output_to_use = output;
+ if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+ _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+ _bias_output.allocator()->allocate();
+ }
+
+ // Permute output
+ if(_needs_permute)
+ {
+        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_output.allocator()->allocate();
+ }
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.configure(output, nullptr, act_info);
+ }
+
+ // Setup flip axis data
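+    // Axis values 0 and 1 request a flip along the width and height dimensions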
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ _flip_axis.unmap();
+}
+
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Kernel size
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
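+    // The checks below restrict the FFT path to square kernels with symmetric, SAME-style padding (kernel_size / 2 per side)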
+ // Strides
+ const auto strides = conv_info.stride();
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+ }
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+
+ return Status{};
+}
+
+void CLFFTConvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Transform input
+ if(_needs_permute)
+ {
+ _permute_input_func.run();
+ }
+ _pad_input_func.run();
+ _transform_input_func.run();
+
+    // Perform operations in the frequency domain
+ _prod_func.run();
+ _reduce_func.run();
+
+ // Transform output
+ _itransform_output_func.run();
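+    // Alias the reshaped view onto the inverse-transform buffer instead of copying it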
+ _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
+ _extract_output_func.run();
+ // Add bias
+ if(_has_bias)
+ {
+ _bias_add_func.run();
+ }
+ if(_needs_permute)
+ {
+ _permute_output_func.run();
+ }
+
+ // Run activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.run();
+ }
+}
+
+void CLFFTConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute bias to NCHW
+ if(_original_bias != nullptr)
+ {
+ _permuted_bias.allocator()->allocate();
+ _permute_bias_func.run();
+ _original_bias->mark_as_unused();
+ }
+
+ const ICLTensor *cur_weights = _original_weights;
+ // Permute weights
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_func.run();
+ cur_weights->mark_as_unused();
+ cur_weights = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->allocate();
+ _flip_weights_func.run();
+ cur_weights->mark_as_unused();
+
+ // Pad weights
+ _padded_weights.allocator()->allocate();
+ _pad_weights_func.run();
+ _flipped_weights.mark_as_unused();
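+        // Ensure the pad kernel has finished before the flipped weights buffer is released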
+ CLScheduler::get().queue().finish();
+ _flipped_weights.allocator()->free();
+
+ // Transform weights to frequency domain
+ _transformed_weights.allocator()->allocate();
+ _transform_weights_func->run();
+ _padded_weights.mark_as_unused();
+ CLScheduler::get().queue().finish();
+ // Delete object and release internal memory
+ _transform_weights_func.reset();
+ _padded_weights.allocator()->free();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d6cda91..fe2a18c 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,7 +97,7 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_non_max)
{
@@ -129,6 +129,4 @@
}
q.flush();
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6a2aac6..7b9229c 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -231,7 +231,8 @@
if(_is_quantized)
{
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
CLScheduler::get().enqueue(_accumulate_biases_kernel);
}
}
-
- _memory_group.release();
}
void CLFullyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e91038f..492709f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,7 +23,10 @@
*/
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Helpers.h"
@@ -33,7 +36,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
#include "arm_compute/runtime/ITensorAllocator.h"
namespace arm_compute
@@ -41,46 +43,6 @@
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;
-namespace
-{
-inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
- bool flag = true;
-
- if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
- {
- if((m > 1) && n < 16)
- {
- flag = true;
- }
- else
- {
- // COMPMID-852
- if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
- {
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
- }
- else
- {
- flag = false;
- }
- }
- }
- else
- {
- // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- flag = m != 1 && reshape_b_only_on_first_run;
- }
-
- return flag;
-}
-} // namespace
-
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
@@ -88,57 +50,102 @@
_reshape_lhs_kernel(),
_reshape_rhs_kernel(),
_mm_reshaped_kernel(),
+ _mm_reshaped_only_rhs_kernel(),
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
- _is_interleaved_transposed(false),
_run_addition(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
- _is_new_gemm_reshaped(false)
+ _gemm_type(GEMMType::NATIVE)
{
}
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ GEMMType gemm_type = GEMMType::RESHAPED_V1;
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+ if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+ {
+ if((m > 1) && (n < 16))
+ {
+ gemm_type = GEMMType::RESHAPED_V1;
+ }
+ else if((m == 1) && (data_type == DataType::F32))
+ {
+ gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+ }
+ else
+ {
+ // COMPMID-852
+ if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
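+                // Heuristic cost model (COMPMID-852): take the reshaped path only when the estimated gain outweighs the reshape overhead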
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ }
+ else
+ {
+ gemm_type = GEMMType::NATIVE;
+ }
+ }
- // Check if we need to reshape the matrix B only on the first run
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _is_prepared = gemm_info.retain_internal_weights();
- _original_b = b;
+ const auto workload = static_cast<float>((m * n) / 20.0f);
- const ICLTensor *matrix_a = a;
- const ICLTensor *matrix_b = b;
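+        // For sufficiently large F32 workloads prefer the newer reshaped (v2) kernel over the v1 reshaped path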
+ gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+ }
+ else
+ {
+ // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+ gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ }
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
+ return gemm_type;
+}
+
+void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- DataType data_type = a->info()->data_type();
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+
+ // Tune kernel statically
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+}
+
+void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
const unsigned int n = b->info()->dimension(0);
const unsigned int k = a->info()->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
+ // Set the target for the kernels
+ _reshape_lhs_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
}
+
GEMMRHSMatrixInfo rhs_info;
rhs_info.n0 = 16 / b->info()->element_size();
rhs_info.k0 = 1;
@@ -153,112 +160,183 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- // Check if we need to reshape the matrix A and matrix B
- _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
- // Check if we can run the new reshaped GEMM
- const auto workload = static_cast<float>((m * n) / 20.0f);
- _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
-
- const bool add_matrix_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
-
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(_is_interleaved_transposed)
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
{
- reinterpret_input_as_3d = false;
-
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
- // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
- if(_is_new_gemm_reshaped)
- {
- GEMMLHSMatrixInfo lhs_info;
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
- depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Configure interleave kernel
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- // Configure transpose kernel
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
- }
+ _memory_group.manage(&_tmp_b);
}
- if(!_is_new_gemm_reshaped)
- {
- // Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
- GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
- gemm_info.fp_mixed_precision());
- CLScheduler::get().tune_kernel_static(_mm_kernel);
- }
+ // Configure interleave kernel
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
- if(_is_interleaved_transposed)
- {
- // Allocate intermediate tensors
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
+ // Configure transpose kernel
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
- // Configure matrix addition kernel
- if(add_matrix_c && !use_fused_add)
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
{
- _ma_kernel.configure(c, output, beta);
- _run_addition = true;
+ _tmp_b.allocator()->allocate();
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON(c != nullptr);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(c);
+
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _reshape_lhs_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+ // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+}
+
+void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON(c != nullptr);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(c);
+
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _mm_kernel.set_target(gpu_target);
+
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ // Manage intermediate buffers
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+}
+
+Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
- // Check if we need to reshape the matrix B only on the first run
- const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+ false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+ if(add_c && !fuse_add)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
// Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
const unsigned int n = b->dimension(0);
const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -280,66 +358,21 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- // Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
- // Check if we can run the new reshaped GEMM
- const auto workload = static_cast<float>((m * n) / 20.0f);
- const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
- const bool add_matrix_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(run_interleave_transpose)
- {
- reinterpret_input_as_3d = false;
- }
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+ true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
-
- if(run_interleave_transpose)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- if(is_new_gemm_reshaped)
- {
- GEMMLHSMatrixInfo lhs_info;
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
- depth_output_gemm3d, reinterpret_input_as_3d)));
- }
- else
- {
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
- }
- }
-
- if(!is_new_gemm_reshaped)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
- run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
- }
-
- if(add_matrix_c && !use_fused_add)
+ if(add_c && !fuse_add)
{
// Validate matrix addition kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -348,32 +381,263 @@
return Status{};
}
+Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+ if(add_c)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+ if(add_c)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = gemm_info.retain_internal_weights();
+ _original_b = b;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+
+ // Select GEMMType
+ _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+
+ const bool is_gemm_v2 = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
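+    // A 1-D bias with beta == 1 can be fused into the matrix multiply kernel (native/v1 paths only); otherwise a
+    // separate matrix addition kernel handles beta * C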
+
+ switch(_gemm_type)
+ {
+ case GEMMType::NATIVE:
+ {
+ configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+
+ // Configure matrix addition kernel
+ if(add_c && !fuse_add)
+ {
+ _ma_kernel.configure(c, output, beta);
+ _run_addition = true;
+ }
+}
+
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+
+ // Select GEMMType
+ GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+ switch(gemm_type)
+ {
+ case GEMMType::NATIVE:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+ }
+ }
+
+ return Status{};
+}
+
void CLGEMM::run()
{
prepare();
- _memory_group.acquire();
-
- if(_is_interleaved_transposed)
- {
- // Run interleave kernel
- CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
- }
- }
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run matrix multiply kernel
- if(_is_new_gemm_reshaped)
+ switch(_gemm_type)
{
- CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
- }
- else
- {
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ case GEMMType::NATIVE:
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
}
// Run matrix addition kernel
@@ -381,15 +645,13 @@
{
CLScheduler::get().enqueue(_ma_kernel);
}
-
- _memory_group.release();
}
void CLGEMM::prepare()
{
if(!_is_prepared)
{
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
{
// Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 7105e85..03d516f 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -262,7 +262,7 @@
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo info_gemm(shape_gemm, 1, data_type);
info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
@@ -372,7 +372,9 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
- TensorInfo im2col_reshaped_info, info_gemm, weights_reshaped_info;
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo weights_reshaped_info{};
const ITensorInfo *gemm_input_to_use = input;
const ITensorInfo *gemm_output_to_use = output;
const ITensorInfo *weights_to_use = weights;
@@ -526,7 +528,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run im2col
if(!_skip_im2col)
@@ -562,8 +564,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
void CLGEMMConvolutionLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
new file mode 100644
index 0000000..bcb91e0
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+{
+ Coordinates start;
+ Coordinates end;
+
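+    // The reshaped output still contains the deconvolution border: these coordinates crop away deconv_info's padding.
+    // In NCHW the spatial dimensions are 0/1, while in NHWC channels occupy index 0 and the crop applies to dimensions 1/2.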
+ if(is_nchw)
+ {
+ start.set(0, deconv_info.pad_left());
+ start.set(1, deconv_info.pad_top());
+ end.set(0, output_info.dimension(0) - deconv_info.pad_right());
+ end.set(1, output_info.dimension(1) - deconv_info.pad_bottom());
+ }
+ else
+ {
+ start.set(0, 0);
+ start.set(1, deconv_info.pad_left());
+ start.set(2, deconv_info.pad_top());
+
+ end.set(0, output_info.dimension(0));
+ end.set(1, output_info.dimension(1) - deconv_info.pad_right());
+ end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
+ }
+
+ return { start, end };
+}
+} // namespace
+
+CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _gemmlowp_output_stage(),
+ _permute_input_to_nhwc(),
+ _permute_weights_to_nhwc(),
+ _reshape_weights(),
+ _transpose_weights(),
+ _deconv_reshape(),
+ _slice_gemm(),
+ _gemmlowp_final(),
+ _reshaped_weights(),
+ _reshaped_weights_t(),
+ _permuted_input(),
+ _permuted_weights(),
+ _gemm_output(),
+ _slice_gemm_input(),
+ _original_weights(),
+ _is_prepared(false),
+ _padded_input(false),
+ _is_nchw(false),
+ _is_quantized(false)
+{
+}
+
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ DataLayout data_layout = input->data_layout();
+ const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ const bool is_nchw = input->data_layout() == DataLayout::NCHW;
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != deconv_info.stride().first);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) != deconv_info.stride().second);
+
+ TensorShape nhwc_weights_shape = weights->tensor_shape();
+ TensorShape nhwc_input_shape = input->tensor_shape();
+
+ if(is_nchw)
+ {
+ permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
+ permute(nhwc_input_shape, PermutationVector(2, 0, 1));
+
+ TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+
+ TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+
+ CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
+ CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
+ }
+
+ const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+ const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
+
+ TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]);
+ const TensorInfo reshaped_t_info = reshaped_info.clone()->set_is_resizable(true).set_tensor_shape(transposed_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
+
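+    // For every input position the GEMM produces a kernel_w * kernel_h * num-filters block, which the deconvolution
+    // reshape kernel later scatters into the upsampled output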
+ TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
+ input->dimension(idx_w),
+ input->dimension(idx_h),
+ input->dimension(idx_b));
+
+ TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
+ GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
+ gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+ }
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+ 0, 0, deconv_info.stride().first, deconv_info.stride().second);
+ const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+
+ if(padded_input && is_quantized)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr,
+ &col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8), output, start_end.first, start_end.second));
+ }
+ else if(padded_input)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
+ }
+ else if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+ }
+
+ return Status{};
+}
+
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
+ weights->info(),
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ deconv_info));
+
+ _original_weights = weights;
+ _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+ const ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+
+    // If the data layout is NCHW, transform everything to NHWC. Another alternative could be to
+ // do an outer product in NCHW and then an accumulation through a reduction. This would have two
+ // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
+ // might be slower than GEMM.
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+
+ _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
+ _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
+ weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
+ 1,
+ input->info()->data_type(), weights->info()->quantization_info()));
+
+ _reshape_weights.configure(weights_to_use, &_reshaped_weights);
+ _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
+
+ // Configure output stage for asymmetric quantized types
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+ }
+ else
+ {
+ _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ }
+
+ if(_is_nchw)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ ICLTensor *deconv_reshape_output = nullptr;
+ ICLTensor *slice_output = nullptr;
+ ICLTensor *output_stage_output = nullptr;
+
+ if(_padded_input && _is_quantized)
+ {
+ _memory_group.manage(&_slice_gemm_input);
+ _memory_group.manage(&_gemmlowp_final);
+ deconv_reshape_output = &_gemmlowp_final;
+ output_stage_output = &_slice_gemm_input;
+ slice_output = output;
+ }
+ else if(_padded_input)
+ {
+ _memory_group.manage(&_slice_gemm_input);
+ deconv_reshape_output = &_slice_gemm_input;
+ slice_output = output;
+ }
+ else if(_is_quantized)
+ {
+ _memory_group.manage(&_gemmlowp_final);
+ deconv_reshape_output = &_gemmlowp_final;
+ output_stage_output = output;
+ }
+ else
+ {
+ deconv_reshape_output = output;
+ }
+
+    // Configure the deconvolution reshape kernel (a Col2Im-like scatter) to rearrange the output of the GEMM
+ _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _gemm_output.allocator()->allocate();
+
+ if(_is_quantized)
+ {
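+        // Fold the input, weights and output quantization scales into a fixed-point multiplier and shift for the
+        // gemmlowp output stage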
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+ int output_multiplier(0);
+ int output_shift(0);
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+ _gemmlowp_final.allocator()->allocate();
+ }
+
+ // If the input was padded, the output needs to be sliced.
+ if(_padded_input)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+ _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+ _slice_gemm_input.allocator()->allocate();
+ }
+}
+
+void CLGEMMDeconvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ if(_is_nchw)
+ {
+ _permute_input_to_nhwc.run();
+ }
+
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+
+ CLScheduler::get().enqueue(_deconv_reshape, false);
+
+ if(_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+
+ if(_padded_input)
+ {
+ _slice_gemm.run();
+ }
+}
+
+void CLGEMMDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ if(_is_nchw)
+ {
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nhwc.run();
+ }
+
+ _reshaped_weights.allocator()->allocate();
+ _reshape_weights.run();
+
+ if(_is_nchw)
+ {
+ _permuted_weights.allocator()->free();
+ }
+
+ _reshaped_weights_t.allocator()->allocate();
+ _transpose_weights.run();
+
+ // Prepare gemm
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ else
+ {
+ _mm_gemmlowp.prepare();
+ }
+
+ // Free resources
+ if(!_reshaped_weights_t.is_used())
+ {
+ _reshaped_weights_t.allocator()->free();
+ }
+
+ _original_weights->mark_as_unused();
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2a01db7..049db1d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -31,7 +32,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
namespace arm_compute
{
@@ -40,17 +40,16 @@
namespace
{
-inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
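+    // Only the RHS matrix is reshaped now, so the reshaped path no longer needs to exclude the m == 1 (vector-by-matrix) case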
+ return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
- _mm_reshaped_kernel(),
- _mtx_a_reshape_kernel(),
+ _mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
_mtx_b_reduction_kernel(),
@@ -58,7 +57,6 @@
_offset_contribution_output_stage_kernel(),
_vector_sum_col(),
_vector_sum_row(),
- _tmp_a(),
_tmp_b(),
_mm_result_s32(),
_original_b(nullptr),
@@ -86,7 +84,6 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mtx_a_reshape_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
const ICLTensor *matrix_a = a;
@@ -105,29 +102,21 @@
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
+ _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
if(_is_gemm_reshaped)
{
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- reinterpret_input_as_3d = false;
-
- matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- _memory_group.manage(&_tmp_a);
if(!_reshape_b_only_on_first_run)
{
_memory_group.manage(&_tmp_b);
}
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-
- // Configure transpose kernel
+ // Configure reshape RHS kernel
_mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
}
@@ -166,7 +155,7 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -185,7 +174,7 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -200,7 +189,6 @@
// Allocate tensors
if(_is_gemm_reshaped)
{
- _tmp_a.allocator()->allocate();
if(!_reshape_b_only_on_first_run)
{
_tmp_b.allocator()->allocate();
@@ -231,11 +219,13 @@
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
- TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
GEMMRHSMatrixInfo rhs_info;
GEMMLHSMatrixInfo lhs_info;
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
const unsigned int n = b->dimension(0);
@@ -243,35 +233,24 @@
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
-
- // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(reshape_matrices)
- {
- reinterpret_input_as_3d = false;
- }
+ bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
- matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- // Validate transpose kernel
-
+ // Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
}
- TensorInfo info_vector_sum_col, info_vector_sum_row;
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
@@ -295,13 +274,13 @@
{
TensorInfo mm_result_s32_info{};
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
}
else
{
@@ -322,22 +301,25 @@
}
else
{
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
}
else
{
// Validate matrix multiply
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
}
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- a_offset, b_offset));
+ if(output->total_size() != 0)
+ {
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ a_offset, b_offset));
+ }
}
return Status{};
@@ -347,13 +329,10 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_is_gemm_reshaped)
{
- // Run reshape matrix A
- CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
-
if(!_reshape_b_only_on_first_run)
{
// Run reshape matrix B
@@ -370,7 +349,7 @@
// Run matrix multiply
if(_is_gemm_reshaped)
{
- CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+ CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
}
else
{
@@ -393,8 +372,6 @@
// Run offset contribution kernel
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
}
-
- _memory_group.release();
}
void CLGEMMLowpMatrixMultiplyCore::prepare()
@@ -422,4 +399,4 @@
_is_prepared = true;
}
}
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
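Many run() methods in this patch (here and in the functions that follow) replace paired _memory_group.acquire()/_memory_group.release() calls with a MemoryGroupResourceScope guard. A small sketch of the RAII idea behind that substitution; the toy MemoryGroup below is illustrative, not the library's class:

#include <iostream>

// Stand-in for a memory group that hands out and reclaims scratch buffers.
struct MemoryGroup
{
    void acquire() { std::cout << "acquire scratch buffers\n"; }
    void release() { std::cout << "release scratch buffers\n"; }
};

// Acquire on construction, release on destruction, so every exit path of the
// enclosing run() method releases the buffers exactly once.
class MemoryGroupResourceScope
{
public:
    explicit MemoryGroupResourceScope(MemoryGroup &group) : _group(group) { _group.acquire(); }
    ~MemoryGroupResourceScope() { _group.release(); }
    MemoryGroupResourceScope(const MemoryGroupResourceScope &) = delete;
    MemoryGroupResourceScope &operator=(const MemoryGroupResourceScope &) = delete;

private:
    MemoryGroup &_group;
};

int main()
{
    MemoryGroup group;
    {
        MemoryGroupResourceScope scope_mg(group); // replaces explicit acquire()/release()
        std::cout << "enqueue kernels\n";
    } // released here, even on early return or exception
    return 0;
}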
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index f30eee1..ea803e4 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,10 +62,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index fd82769..b671b23 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,10 +76,10 @@
if(num_levels > 1)
{
- _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _vertical_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _horizontal_border_handler.resize(num_levels - 1);
+ _vertical_border_handler.resize(num_levels - 1);
+ _horizontal_reduction.resize(num_levels - 1);
+ _vertical_reduction.resize(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -153,8 +153,8 @@
if(num_levels > 1)
{
- _gauss5x5 = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+ _gauss5x5.resize(num_levels - 1);
+ _scale_nearest.resize(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index c50132e..d712a23 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -256,7 +256,7 @@
void CLGenerateProposalsLayer::run()
{
// Acquire all the temporaries
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Compute all the anchors
CLScheduler::get().enqueue(_compute_anchors_kernel, false);
@@ -277,8 +277,5 @@
// Add dummy batch indexes
CLScheduler::get().enqueue(_memset_kernel, true);
CLScheduler::get().enqueue(_padded_copy_kernel, true);
-
- // Release all the temporaries
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 1470d5c..0931443 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,7 +95,7 @@
void CLHOGDescriptor::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run gradient
_gradient.run();
@@ -105,6 +105,4 @@
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
-
- _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 51aeaed..e509fd8 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,13 +71,11 @@
void CLHOGGradient::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
-
- _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 8012c2f..f799d61 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -128,12 +128,11 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel.resize(_num_orient_bin_kernel);
+ _block_norm_kernel.resize(_num_block_norm_kernel);
+ _hog_detect_kernel.resize(_num_hog_detect_kernel);
+ _hog_space.resize(_num_orient_bin_kernel);
+ _hog_norm_space.resize(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -172,10 +171,10 @@
_hog_space[i].allocator()->init(info_space);
// Manage intermediate buffers
- _memory_group.manage(_hog_space.get() + i);
+ _memory_group.manage(&_hog_space[i]);
// Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -193,10 +192,10 @@
_hog_norm_space[i].allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_hog_norm_space.get() + i);
+ _memory_group.manage(&_hog_norm_space[i]);
// Initialize block normalization kernel
- _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -212,13 +211,13 @@
{
const size_t idx_block_norm = input_hog_detect[i];
- _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
}
detection_window_strides->unmap(CLScheduler::get().queue());
// Configure non maxima suppression kernel
- _non_maxima_kernel->configure(_detection_windows, min_distance);
+ _non_maxima_kernel.configure(_detection_windows, min_distance);
// Allocate intermediate tensors
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
@@ -231,7 +230,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reset detection window
_detection_windows->clear();
@@ -242,13 +241,13 @@
// Run orientation binning kernel
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ CLScheduler::get().enqueue(_orient_bin_kernel[i], false);
}
// Run block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ CLScheduler::get().enqueue(_block_norm_kernel[i], false);
}
// Run HOG detector kernel
@@ -262,9 +261,7 @@
{
// Map detection windows array before computing non maxima suppression
_detection_windows->map(CLScheduler::get().queue(), true);
- Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
+ Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
-
- _memory_group.release();
}
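The HOG, pyramid, optical-flow and reduce-mean functions touched by this patch swap make_unique<T[]> members for std::vector members, so elements are reached with operator[] and manage()/configure() take &v[i] instead of v.get() + i. A compact, illustrative sketch of the before/after container pattern with a toy Kernel type:

#include <cstddef>
#include <memory>
#include <vector>

struct Kernel
{
    void configure(int id) { _id = id; }
    int  _id = 0;
};

// Before: an owning array whose element count is kept elsewhere;
// element access goes through get() plus pointer arithmetic.
void configure_with_unique_ptr(std::size_t n)
{
    std::unique_ptr<Kernel[]> kernels = std::make_unique<Kernel[]>(n);
    for(std::size_t i = 0; i < n; ++i)
    {
        (kernels.get() + i)->configure(static_cast<int>(i));
    }
}

// After: a vector sized with resize(); the count lives in the container and
// elements are addressed with kernels[i] (or &kernels[i] where a pointer is needed).
void configure_with_vector(std::size_t n)
{
    std::vector<Kernel> kernels;
    kernels.resize(n);
    for(std::size_t i = 0; i < n; ++i)
    {
        kernels[i].configure(static_cast<int>(i));
    }
}

int main()
{
    configure_with_unique_ptr(3);
    configure_with_vector(3);
    return 0;
}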
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 65ce7de..67f550d3 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@
_gy(),
_score(),
_nonmax(),
- _corners_list(nullptr),
+ _corners_list(),
_num_corner_candidates(0),
_corners(nullptr)
{
@@ -84,7 +84,7 @@
_score.allocator()->init(info_f32);
_nonmax.allocator()->init(info_f32);
- _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list.resize(shape.x() * shape.y());
// Manage intermediate buffers
_memory_group.manage(&_gx);
@@ -146,20 +146,20 @@
_score.allocator()->allocate();
// Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
// Allocate intermediate buffers
_nonmax.allocator()->allocate();
// Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+ _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
}
void CLHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -185,6 +185,4 @@
_corners->map(CLScheduler::get().queue(), true);
Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
_corners->unmap(CLScheduler::get().queue());
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 2e3c6d7..136cb5e 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,9 @@
void CLL2NormalizeLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_reduce_func.run();
CLScheduler::get().enqueue(_normalize_kernel, true);
-
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index f01b1b8..4606a66 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -43,10 +43,11 @@
_pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
_accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
- _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
- _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
- _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
- _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+ _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+ _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+ _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+ _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+ _is_prepared(false)
{
}
@@ -93,25 +94,38 @@
lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
-
// Configure block that calculates the forget gate
// forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
- TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias )
_forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
_forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _memory_group.manage(&_forget_gate_out1);
- _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+ std::vector<const ICLTensor *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
+
_memory_group.manage(&_forget_gate_out2);
- _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
- _memory_group.manage(&_forget_gate_out3);
- _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
- _forget_gate_out2.allocator()->allocate();
+ _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+
+ std::vector<const ICLTensor *> weights_vector;
+
+ weights_vector.emplace_back(input_to_forget_weights);
+ weights_vector.emplace_back(recurrent_to_forget_weights);
+ const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+ _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
- _forget_gate_out1.allocator()->allocate();
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+ _memory_group.manage(&_forget_gate_out1);
+ _memory_group.manage(&_forget_gate_out3);
+ _forget_gate_out6.allocator()->allocate();
+
CLTensor *forget_gate_out = &_forget_gate_out5;
if(lstm_params.has_peephole_opt())
{
@@ -134,43 +148,46 @@
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
+ // We optimize this as follows:
+ // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
CLTensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
_subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
else
{
- TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
- _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
_input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ std::vector<const ICLTensor *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
_memory_group.manage(&_input_gate_out1);
- _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
- _memory_group.manage(&_input_gate_out2);
- _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+
_memory_group.manage(&_input_gate_out3);
- _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
- _memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
- _input_gate_out3.allocator()->allocate();
- input_gate_out = &_input_gate_out4;
+
+ input_gate_out = &_input_gate_out3;
if(_run_peephole_opt)
{
- _memory_group.manage(&_input_gate_out5);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _memory_group.manage(&_input_gate_out4);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
- _input_gate_out5.allocator()->allocate();
input_gate_out = &_input_gate_out1;
}
else
@@ -215,35 +232,39 @@
// Configure block that calculates the output
// output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
- TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
_output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
- _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ std::vector<const ICLTensor *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
_memory_group.manage(&_output1);
- _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
- _memory_group.manage(&_output2);
- _transpose_output.configure(recurrent_to_output_weights, &_output2);
- _memory_group.manage(&_output3);
- _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _memory_group.manage(&_output4);
+
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
_output2.allocator()->allocate();
- _memory_group.manage(&_output5);
- _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
- _output3.allocator()->allocate();
- CLTensor *output_gate_out = &_output5;
+ _forget_gate_out2.allocator()->allocate();
+
+ CLTensor *output_gate_out = &_output4;
if(lstm_params.has_peephole_opt())
{
- _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
- _memory_group.manage(&_output4);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
- _output5.allocator()->allocate();
+ _memory_group.manage(&_output3);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _output4.allocator()->allocate();
output_gate_out = &_output1;
// Allocate intermediate buffers
- _output4.allocator()->allocate();
+ _output3.allocator()->allocate();
}
else
{
@@ -369,8 +390,15 @@
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
+ std::vector<const ITensorInfo *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -388,9 +416,15 @@
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+ std::vector<const ITensorInfo *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
+
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -419,10 +453,15 @@
cell_threshold)));
}
+ std::vector<const ITensorInfo *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
// Validate output gate tmp
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -464,12 +503,13 @@
void CLLSTMLayer::run()
{
- _memory_group.acquire();
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ CLScheduler::get().enqueue(_concat_inputs_forget_gate);
_fully_connected_forget_gate.run();
- CLScheduler::get().enqueue(_transpose_forget_gate);
- _gemm_forget_gate.run();
- CLScheduler::get().enqueue(_accum_forget_gate1);
if(_run_peephole_opt)
{
@@ -480,24 +520,13 @@
if(_run_cifg_opt)
{
- _ones.map(true);
- if(_ones.info()->data_type() == DataType::F16)
- {
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- else
- {
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- _ones.unmap();
+ CLScheduler::get().enqueue(_ones_memset_kernel);
CLScheduler::get().enqueue(_subtract_input_gate);
}
else
{
_fully_connected_input_gate.run();
- CLScheduler::get().enqueue(_transpose_input_gate);
- _gemm_input_gate.run();
- CLScheduler::get().enqueue(_accum_input_gate1);
+
if(_run_peephole_opt)
{
CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
@@ -521,9 +550,6 @@
}
_fully_connected_output.run();
- CLScheduler::get().enqueue(_transpose_output);
- _gemm_output.run();
- CLScheduler::get().enqueue(_accum_output1);
if(_run_peephole_opt)
{
@@ -548,6 +574,18 @@
CLScheduler::get().enqueue(_copy_output);
_concat_scratch_buffer.run();
+}
- _memory_group.release();
+void CLLSTMLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ CLScheduler::get().enqueue(_concat_weights_forget_gate);
+ if(!_run_cifg_opt)
+ {
+ CLScheduler::get().enqueue(_concat_weights_input_gate);
+ }
+ CLScheduler::get().enqueue(_concat_weights_output);
+ _is_prepared = true;
+ }
}
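The LSTM rework above folds each gate's separate fully-connected, transpose, GEMM and add into one fully-connected call over the width-concatenated (input, output_state) tensor and the width-concatenated weights. A tiny numeric sketch of why the two formulations agree, using plain loops and toy sizes rather than the library API:

#include <array>
#include <cassert>
#include <cmath>
#include <cstdio>

constexpr int IN = 2, STATE = 3; // toy widths for the input and the recurrent state

float dot(const float *a, const float *b, int n)
{
    float acc = 0.f;
    for(int i = 0; i < n; ++i)
    {
        acc += a[i] * b[i];
    }
    return acc;
}

int main()
{
    const std::array<float, IN>    x    = { 1.f, 2.f };
    const std::array<float, STATE> h    = { 3.f, 4.f, 5.f };
    const std::array<float, IN>    wx   = { 0.1f, 0.2f };       // one row of input_to_*_weights
    const std::array<float, STATE> wh   = { 0.3f, 0.4f, 0.5f }; // one row of recurrent_to_*_weights
    const float                    bias = 0.25f;

    // Old path: two products, one against the input and one against the state.
    const float two_step = dot(x.data(), wx.data(), IN) + dot(h.data(), wh.data(), STATE) + bias;

    // New path: one product over the width-concatenated input and weights.
    std::array<float, IN + STATE> xh{};
    std::array<float, IN + STATE> w{};
    for(int i = 0; i < IN; ++i)
    {
        xh[i] = x[i];
        w[i]  = wx[i];
    }
    for(int i = 0; i < STATE; ++i)
    {
        xh[IN + i] = h[i];
        w[IN + i]  = wh[i];
    }
    const float one_step = dot(xh.data(), w.data(), IN + STATE) + bias;

    assert(std::fabs(two_step - one_step) < 1e-5f); // equal up to rounding
    std::printf("two_step=%f one_step=%f\n", two_step, one_step);
    return 0;
}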
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 559b57f..a118518 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,8 +70,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
- _subf = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+ _convf.resize(_num_levels);
+ _subf.resize(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 911c9b3..13116bf 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,8 +63,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
+ _addf.resize(num_levels);
+ _scalef.resize(num_levels - 1);
const size_t last_level = num_levels - 1;
@@ -85,7 +85,7 @@
void CLLaplacianReconstruct::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 5c6bef9..3e99dde 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -176,7 +176,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -186,8 +186,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
-
- _memory_group.release();
}
void CLLocallyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 157f306..8517b59 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,7 +104,7 @@
template <typename T>
void CLMeanStdDev::run_float()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Perform reduction on x-axis
_reduction_operation_mean.run();
@@ -140,8 +140,6 @@
_reduction_output_stddev.unmap();
}
_reduction_output_mean.unmap();
-
- _memory_group.release();
}
void CLMeanStdDev::run_int()
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index d00b1b5..a013a1f 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,12 +84,12 @@
const int old_values_list_length = list_length * window_dimension * window_dimension;
// Create kernels and tensors
- _tracker_init_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
- _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
- _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
- _func_scharr = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
- _scharr_gx = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
- _scharr_gy = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _tracker_init_kernel.resize(_num_levels);
+ _tracker_stage0_kernel.resize(_num_levels);
+ _tracker_stage1_kernel.resize(_num_levels);
+ _func_scharr.resize(_num_levels);
+ _scharr_gx.resize(_num_levels);
+ _scharr_gy.resize(_num_levels);
// Create internal keypoint arrays
_old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
@@ -118,8 +118,8 @@
_scharr_gy[i].allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_scharr_gx.get() + i);
- _memory_group.manage(_scharr_gy.get() + i);
+ _memory_group.manage(&_scharr_gx[i]);
+ _memory_group.manage(&_scharr_gy[i]);
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -149,7 +149,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int level = _num_levels; level > 0; --level)
{
@@ -167,6 +167,4 @@
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 3aa1b1e..99e3121 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -25,39 +25,293 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
{
CLPadLayer::CLPadLayer()
- : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+ : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
{
}
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
{
- // Copy the input to the output
- _copy_kernel.configure(input, output, padding);
-
- // Set the pages of the output to zero
+ // Set the pages of the output to the constant_value.
_memset_kernel.configure(output, constant_value);
- // Fill padding on the first two dimensions with zeros
- _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
+ // Fill out padding list with zeroes.
+ PaddingList padding_extended = padding;
+ for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ {
+ padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+ }
+
+ // Create a window within the output tensor where the input will be copied.
+ Window copy_window = Window();
+ for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
+ {
+ copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
+ }
+ // Copy the input to the output, leaving the padding filled with the constant_value.
+ _copy_kernel.configure(input, output, PaddingList(), &copy_window);
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+ int64_t last_padding_dimension = _padding.size() - 1;
+ // Reflecting can be performed by effectively unfolding the input as follows:
+ // For each dimension starting at DimX:
+ // Create a before and after slice, whose values depend on the selected padding mode
+ // Concatenate the before and after padding with the tensor to be padded
+ // Two strided slice functions are required for each padded dimension, as well as a
+ // concatenate function and tensors to hold the temporary results.
+ _slice_functions.resize(2 * _num_dimensions);
+ _slice_results.resize(2 * _num_dimensions);
+ _concat_functions.resize(_num_dimensions);
+ _concat_results.resize(_num_dimensions - 1);
+
+ Coordinates starts_before{};
+ Coordinates ends_before{};
+ Coordinates starts_after{};
+ Coordinates ends_after{};
+ Coordinates strides{};
+ ICLTensor *prev = input;
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+ if(i > 0)
+ {
+ strides.set(i - 1, 1);
+ }
+
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ // Set the starts, ends, and strides values for the current dimension.
+ // Due to the bit masks passed to strided slice, the values below the current dimension in
+ // starts and ends are ignored, so they do not need to be modified.
+ if(_mode == PaddingMode::REFLECT)
+ {
+ starts_before.set(i, _padding[i].first);
+ ends_before.set(i, 0);
+ starts_after.set(i, input->info()->dimension(i) - 2);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+ strides.set(i, -1);
+ }
+ else
+ {
+ starts_before.set(i, _padding[i].first - 1);
+ ends_before.set(i, -1);
+ starts_after.set(i, input->info()->dimension(i) - 1);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+ strides.set(i, -1);
+ }
+
+ // Strided slice wraps negative indexes around to the end of the range;
+ // here a negative index should mean the full range, so the bit mask is modified instead.
+ const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+ // Reflect the input values for the padding before and after the input.
+ std::vector<ICLTensor *> concat_vector;
+ if(_padding[i].first > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ concat_vector.push_back(&_slice_results[2 * i]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ concat_vector.push_back(prev);
+ if(_padding[i].second > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ concat_vector.push_back(&_slice_results[2 * i + 1]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ // Concatenate the padding before and after with the input.
+ ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
+ _concat_functions[i].configure(concat_vector, out, i);
+ prev = out;
+ }
+ }
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if((static_cast<int32_t>(i) != last_padding_dimension))
+ {
+ _concat_results[i].allocator()->allocate();
+ }
+ _slice_results[2 * i].allocator()->allocate();
+ _slice_results[2 * i + 1].allocator()->allocate();
+ }
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _padding = padding;
+ _mode = mode;
+
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+ // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+ int64_t last_padding_dimension = _padding.size() - 1;
+ for(; last_padding_dimension >= 0; --last_padding_dimension)
+ {
+ if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
+ {
+ break;
+ }
+ }
+ _num_dimensions = last_padding_dimension + 1;
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ configure_constant_mode(input, output, padding, constant_value);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ configure_reflect_symmetric_mode(input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel.configure(input, output);
+ }
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ // Use CLCopyKernel and CLMemsetKernel to validate all padding modes as this includes all of the shape and info validation.
+ PaddingList padding_extended = padding;
+ for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ {
+ padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+ }
+
+ Window copy_window = Window();
+ for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+ {
+ copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
+ }
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
+ }
+
+ switch(mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < padding.size(); ++i)
+ {
+ if(mode == PaddingMode::REFLECT)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+ }
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid mode");
+ }
+ }
return Status{};
}
void CLPadLayer::run()
{
- CLScheduler::get().enqueue(_memset_kernel, false);
- CLScheduler::get().enqueue(_fillborder_kernel, false);
- CLScheduler::get().enqueue(_copy_kernel, true);
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ CLScheduler::get().enqueue(_memset_kernel, false);
+ CLScheduler::get().enqueue(_copy_kernel, true);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i].run();
+ }
+ if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i + 1].run();
+ }
+ CLScheduler::get().sync();
+ _concat_functions[i].run();
+ CLScheduler::get().sync();
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_copy_kernel, true);
+ }
}
} // namespace arm_compute
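A standalone sketch of the reflect/symmetric unfolding described in the comments above, reduced to one dimension: the padding before and after the tensor is produced by reading reversed slices and concatenating them around the input. Plain std::vector, not the CL kernels:

#include <cstdio>
#include <vector>

// REFLECT skips the edge element when mirroring, SYMMETRIC repeats it.
std::vector<int> pad_1d(const std::vector<int> &in, int before, int after, bool reflect)
{
    const int        offset = reflect ? 1 : 0;
    const int        n      = static_cast<int>(in.size());
    std::vector<int> out;
    for(int i = before - 1; i >= 0; --i) // "before" slice, read backwards
    {
        out.push_back(in[i + offset]);
    }
    out.insert(out.end(), in.begin(), in.end()); // the tensor itself
    for(int i = 0; i < after; ++i) // "after" slice, read backwards
    {
        out.push_back(in[n - 1 - offset - i]);
    }
    return out;
}

int main()
{
    const std::vector<int> in = { 1, 2, 3, 4, 5 };
    for(int v : pad_1d(in, 2, 2, true))
    {
        std::printf("%d ", v); // prints: 3 2 1 2 3 4 5 4 3
    }
    std::printf("\n");
    for(int v : pad_1d(in, 2, 2, false))
    {
        std::printf("%d ", v); // prints: 2 1 1 2 3 4 5 5 4
    }
    std::printf("\n");
    return 0;
}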
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index b4c20db..959464c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
@@ -54,3 +54,26 @@
{
return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
}
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index a13859c..df10e1e 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,54 +21,22 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-CLQuantizationLayer::CLQuantizationLayer()
- : _quantize_kernel(), _min_max_kernel(), _min_max()
+namespace arm_compute
{
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
- TensorInfo min_max{ input->num_channels(), input->data_type() };
- ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
-
- return Status{};
+ return CLQuantizationLayerKernel::validate(input, output);
}
-
-void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
- _min_max_kernel.configure(input, &_min_max);
-
- // Configure quantize kernel
- _quantize_kernel.configure(input, output, &_min_max);
-
- // Allocate min_max tensor
- _min_max.allocator()->allocate();
-}
-
-void CLQuantizationLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- // Reset min and max
- _min_max_kernel.reset(q);
-
- // Run min-max kernel
- CLScheduler::get().enqueue(_min_max_kernel, false);
-
- // Run quantize kernel
- CLScheduler::get().enqueue(_quantize_kernel, false);
-}
+} // namespace arm_compute
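
CLQuantizationLayer now follows the single-kernel "simple function" shape: configure() builds one kernel and hands it to the base class, whose inherited run() enqueues it, so the dedicated run() override and the intermediate min/max tensor disappear. A minimal standalone sketch of that ownership pattern; the class names here are illustrative, not the ICLSimpleFunction API:

    #include <iostream>
    #include <memory>

    struct IKernel
    {
        virtual ~IKernel() = default;
        virtual void run() = 0;
    };

    struct QuantizeKernel : IKernel
    {
        void configure(float scale) { _scale = scale; }
        void run() override { std::cout << "quantize with scale " << _scale << "\n"; }
        float _scale{ 1.f };
    };

    // Base class owning exactly one kernel, analogous in spirit to a "simple function".
    class SimpleFunction
    {
    public:
        void run() { _kernel->run(); }
    protected:
        std::unique_ptr<IKernel> _kernel;
    };

    class QuantizationLayerSketch : public SimpleFunction
    {
    public:
        void configure(float scale)
        {
            auto k = std::make_unique<QuantizeKernel>();
            k->configure(scale);
            _kernel = std::move(k); // base run() will enqueue this kernel
        }
    };

    int main()
    {
        QuantizationLayerSketch q;
        q.configure(0.5f);
        q.run();
        return 0;
    }
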
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 63f00ac..19eb69f 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,7 +105,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_fully_connected_kernel.run();
_gemm_state_f.run();
@@ -114,8 +114,6 @@
// copy hidden out to output
CLScheduler::get().enqueue(_copy_kernel);
-
- _memory_group.release();
}
void CLRNNLayer::prepare()
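
The recurring change in the run() methods of this release replaces paired _memory_group.acquire()/release() calls with a MemoryGroupResourceScope object, so intermediate buffers are released on every exit path. A standalone sketch of the underlying RAII idea, using stand-in types rather than the real MemoryGroup interface:

    #include <iostream>

    struct MemoryGroup
    {
        void acquire() { std::cout << "acquire\n"; }
        void release() { std::cout << "release\n"; }
    };

    class MemoryGroupResourceScope
    {
    public:
        explicit MemoryGroupResourceScope(MemoryGroup &group) : _group(group) { _group.acquire(); }
        ~MemoryGroupResourceScope() { _group.release(); }
    private:
        MemoryGroup &_group;
    };

    void run(MemoryGroup &mg)
    {
        MemoryGroupResourceScope scope_mg(mg); // replaces the explicit acquire()/release() pair
        std::cout << "enqueue kernels\n";
    } // release() runs here even if an exception is thrown above

    int main()
    {
        MemoryGroup mg;
        run(mg);
        return 0;
    }
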
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b2d0f81..a3634cd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Types.h"
@@ -40,10 +41,10 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
- _reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
@@ -57,9 +58,9 @@
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
if(i == _reduction_ops - 1 && keep_dims)
{
@@ -68,8 +69,8 @@
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -92,13 +93,15 @@
out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
}
}
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
TensorShape out_shape = input->tensor_shape();
@@ -140,7 +143,7 @@
void CLReduceMean::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
@@ -151,6 +154,5 @@
{
_reshape.run();
}
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 3d82e3f..9f99d2d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -71,7 +71,7 @@
else
{
// Create temporary tensor infos
- auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+ std::vector<TensorInfo> sums_vector(num_of_stages - 1);
// Create intermediate tensor info
TensorShape shape{ input->tensor_shape() };
@@ -110,17 +110,17 @@
}
// Validate ReductionOperation only on first kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
}
return Status{};
@@ -133,7 +133,7 @@
_is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
- _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+ _reduction_kernels_vector.resize(_num_of_stages);
// Create temporary tensors
if(_is_serial)
@@ -142,8 +142,8 @@
}
else
{
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _border_handlers_vector.resize(_num_of_stages);
+ _results_vector.resize(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
@@ -152,7 +152,7 @@
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_results_vector.get());
+ _memory_group.manage(&_results_vector[0]);
ReductionOperation first_kernel_op;
ReductionOperation intermediate_kernel_op;
@@ -183,30 +183,30 @@
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
_border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_results_vector.get() + i);
- _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
- _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _memory_group.manage(&_results_vector[i]);
+ _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[last_stage - 1].allocator()->allocate();
}
}
void CLReductionOperation::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_is_serial)
{
@@ -220,6 +220,4 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
-
- _memory_group.release();
}
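
The other recurring change swaps make_unique<T[]> arrays for std::vector<T>: sizes come from resize(), element access uses operator[], and pointers previously formed with .get() + i become &vec[i]. A small standalone sketch of the before/after, with an illustrative Kernel type:

    #include <iostream>
    #include <vector>

    struct Kernel
    {
        void configure(int stage) { _stage = stage; }
        int _stage{ 0 };
    };

    int main()
    {
        const unsigned int num_of_stages = 4;

        // Before: auto kernels = std::make_unique<Kernel[]>(num_of_stages);
        //         accessed via kernels.get() + i
        // After:
        std::vector<Kernel> kernels;
        kernels.resize(num_of_stages);

        for(unsigned int i = 0; i < num_of_stages; ++i)
        {
            kernels[i].configure(static_cast<int>(i)); // operator[] instead of .get() + i
        }

        Kernel *last = &kernels[num_of_stages - 1];    // &vec[i] replaces vec.get() + i
        std::cout << "last stage: " << last->_stage << "\n";
        return 0;
    }
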
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index d4bc855..22fbef1 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 6083090..9b38f69 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d671846..7e41dba 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -191,7 +191,7 @@
void CLSoftmaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_flattening)
{
@@ -205,9 +205,6 @@
{
CLScheduler::get().enqueue(_reshape_kernel, true);
}
-
- // Relase intermediate buffers
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index f084351..8d37d53 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,8 +42,8 @@
void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
{
// Create Slice functions
- _num_outputs = outputs.size();
- _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+ _num_outputs = outputs.size();
+ _slice_functions.resize(_num_outputs);
// Get output shape
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 71327fe..2700b49 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -46,8 +46,8 @@
void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
{
- _num_inputs = input.size();
- _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+ _num_inputs = input.size();
+ _stack_kernels.resize(_num_inputs);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 428d091..eb1dd8c 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,7 +74,7 @@
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
_num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
- _strided_slice_vector = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+ _strided_slice_vector.resize(_num_slices);
Coordinates slice_start;
int32_t slice_end_mask;
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index d0801a6..a8667c3 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -51,7 +51,7 @@
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
switch(num_inputs)
@@ -90,7 +90,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -109,7 +109,7 @@
break;
default:
// Configure generic case WidthConcatenate kernels
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
unsigned int width_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 069196e..d3c3f98 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,11 @@
output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
kernel_dims.height == 1 ? 1U : 4U);
}
+ else if(kernel_max_dim == 7U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+ kernel_dims.height == 1 ? 1U : 2U);
+ }
return output_tile;
}
@@ -73,7 +78,8 @@
std::vector<WinogradConfiguration> fast_math_winograd =
{
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
};
auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -198,7 +204,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input transform
_input_transform.run();
@@ -208,8 +214,6 @@
// Run output transform
CLScheduler::get().enqueue(_output_transform);
-
- _memory_group.release();
}
void CLWinogradConvolutionLayer::prepare()
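
With the new 7x7 (and 7x1/1x7) Winograd kernels, the output-tile heuristic gains a branch mapping a maximum kernel dimension of 7 to a 2x2 tile, collapsing to 1 along any unit kernel dimension. A standalone sketch of just that branch; the 3x3/5x5 branches are elided and Size2D is a minimal stand-in:

    #include <algorithm>
    #include <iostream>

    struct Size2D
    {
        unsigned int width;
        unsigned int height;
    };

    Size2D winograd_output_tile(const Size2D &kernel_dims)
    {
        Size2D output_tile{ 1U, 1U };
        const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);

        // ... 3x3 and 5x5 branches elided here ...
        if(kernel_max_dim == 7U) // branch added in this release
        {
            output_tile = Size2D{ kernel_dims.width == 1 ? 1U : 2U,
                                  kernel_dims.height == 1 ? 1U : 2U };
        }
        return output_tile;
    }

    int main()
    {
        const Size2D tile = winograd_output_tile(Size2D{ 7U, 7U });
        std::cout << tile.width << "x" << tile.height << "\n"; // 2x2
        return 0;
    }
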
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
deleted file mode 100644
index cd97849..0000000
--- a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-namespace
-{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave)
-{
- GEMMLHSMatrixInfo lhs_info;
- GEMMRHSMatrixInfo rhs_info;
-
- // Configure GEMMLHSMatrixInfo
- lhs_info.m0 = m0;
- lhs_info.k0 = k0;
- lhs_info.v0 = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
- lhs_info.interleave = lhs_interleave;
- lhs_info.transpose = false;
-
- // Configure GEMMRHSMatrixInfo
- rhs_info.n0 = n0;
- rhs_info.k0 = lhs_info.k0;
- rhs_info.h0 = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
- rhs_info.interleave = rhs_interleave;
- rhs_info.transpose = true;
-
- return std::make_pair(lhs_info, rhs_info);
-}
-
-} // namespace
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
- ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
- ARM_COMPUTE_UNUSED(data_type);
-
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
- // Configurations for Mali-G76
- static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
- {
- { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
- { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
- };
-
- // Configurations for Mali-G7x
- static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
- {
- { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
- { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
- };
-
- switch(gpu_target)
- {
- case GPUTarget::G76:
- return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
- default:
- return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
- }
- else
- {
- return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(dot8_supported(CLKernelLibrary::get().get_device()))
- {
- if(n <= 4)
- {
- return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
- }
- else
- {
- return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
- }
- }
- else
- {
- if(n <= 4)
- {
- return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
- }
- else
- {
- return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
- }
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
- }
- else
- {
- return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
- }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
- ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_UNUSED(b);
-
- if(n <= 4)
- {
- return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
- }
- else
- {
- return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
- }
-}
-} // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp
new file mode 100644
index 0000000..30fd558
--- /dev/null
+++ b/src/runtime/CL/tuners/CLLWSList.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+size_t CLLWSList::size()
+{
+ return search_space_shape.total_size();
+}
+
+cl::NDRange CLLWSListExhaustive::operator[](size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= size());
+ auto coords = index2coords(search_space_shape, index);
+ return cl::NDRange{ coords[0] + 1U, coords[1] + 1U, coords[2] + 1U };
+}
+
+CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws)
+{
+ ARM_COMPUTE_UNUSED(gws);
+ search_space_shape = TensorShape(max_lws_supported_x,
+ max_lws_supported_y,
+ max_lws_supported_z);
+}
+
+cl::NDRange CLLWSListNormal::operator[](size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= size());
+ auto coords = index2coords(search_space_shape, index);
+ return cl::NDRange{ _lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]] };
+}
+
+CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws)
+{
+ auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+ // Initialize the LWS values to test
+ initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+ search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; ++i)
+ {
+ // Power of two condition
+ const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Condition on the modulo, according to the mod_let_one flag
+ const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+ if(mod_cond || is_power_of_two)
+ {
+ lws.push_back(i);
+ }
+ }
+}
+
+CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
+{
+ auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+ auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+ auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+ // Initialize the LWS values to test
+ initialize_lws_values(_lws_x, lws_x_max);
+ initialize_lws_values(_lws_y, lws_y_max);
+ initialize_lws_values(_lws_z, lws_z_max);
+
+ search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; i *= 4)
+ {
+ lws.push_back(i);
+ }
+}
+} // namespace cl_tuner
+} // namespace arm_compute
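
CLLWSListNormal builds its candidate local-work-size list per axis: 1 is always kept, then every value up to the cap that is a power of two or whose remainder against the global size satisfies the mod condition (exact division, or a remainder of at most one when mod_let_one is set). A standalone sketch of that generation:

    #include <iostream>
    #include <vector>

    std::vector<unsigned int> lws_candidates(unsigned int gws, unsigned int lws_max, bool mod_let_one)
    {
        std::vector<unsigned int> lws;
        lws.push_back(1);

        for(unsigned int i = 2; i <= lws_max; ++i)
        {
            const bool is_power_of_two = (i & (i - 1)) == 0;
            const bool mod_cond        = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;

            if(mod_cond || is_power_of_two)
            {
                lws.push_back(i);
            }
        }
        return lws;
    }

    int main()
    {
        for(unsigned int v : lws_candidates(24U, 8U, false))
        {
            std::cout << v << " "; // 1 2 3 4 6 8
        }
        std::cout << "\n";
        return 0;
    }
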
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 79e619c..9a141cb 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -600,7 +600,7 @@
if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
{
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
- for(auto it : indices)
+ for(auto const &it : indices)
{
const int label = it.first;
const std::vector<int> &label_indices = it.second;
@@ -614,7 +614,7 @@
for(auto idx : label_indices)
{
ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
- score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+ score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
}
}
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index f3355a7..f7240db 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,16 +54,16 @@
/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
* out of date (or for bare metal mode) */
#ifndef HWCAP_ASIMDHP
-#define HWCAP_ASIMDHP (1 << 10)
-#endif /* HWCAP_ASIMDHP */
+#define HWCAP_ASIMDHP (1 << 10) // NOLINT
+#endif /* HWCAP_ASIMDHP */
#ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif /* HWCAP_CPUID */
+#define HWCAP_CPUID (1 << 11) // NOLINT
+#endif /* HWCAP_CPUID */
#ifndef HWCAP_ASIMDDP
-#define HWCAP_ASIMDDP (1 << 20)
-#endif /* HWCAP_ASIMDDP */
+#define HWCAP_ASIMDDP (1 << 20) // NOLINT
+#endif /* HWCAP_ASIMDDP */
namespace
{
@@ -146,12 +146,12 @@
break;
}
}
- else if(implementer == 0x48) // HiSilicon CPUs
+ else if(implementer == 0x48)
{
// Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
switch(cpunum)
{
- case 0xd40: // A76 (Kirin 980)
+ case 0xd40: // A76
model = CPUModel::GENERIC_FP16_DOT;
break;
default:
@@ -220,8 +220,8 @@
while(bool(getline(file, line)))
{
- regmatch_t match[2];
- ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+ std::array<regmatch_t, 2> match;
+ ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -244,7 +244,7 @@
continue;
}
- ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+ ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -254,7 +254,7 @@
continue;
}
- ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+ ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -264,7 +264,7 @@
continue;
}
- ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+ ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -274,7 +274,7 @@
continue;
}
- ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+ ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
@@ -302,8 +302,7 @@
int get_max_cpus()
{
- int max_cpus = 1;
-#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ int max_cpus = 1;
std::ifstream CPUspresent;
CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
bool success = false;
@@ -341,7 +340,6 @@
{
max_cpus = std::thread::hardware_concurrency();
}
-#endif /* BARE_METAL */
return max_cpus;
}
#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
@@ -427,8 +425,8 @@
std::string line;
while(bool(getline(cpuinfo, line)))
{
- regmatch_t match[2];
- ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+ std::array<regmatch_t, 2> match;
+ ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0);
if(ret_status == 0)
{
std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
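
The /proc/cpuinfo parsing keeps the POSIX regex API but stores the submatches in a std::array<regmatch_t, 2> and passes match.data() to regexec(); the first capture group then yields the field of interest via its rm_so/rm_eo offsets. A standalone sketch of that pattern (the regular expression string here is illustrative):

    #include <array>
    #include <iostream>
    #include <regex.h>
    #include <string>

    int main()
    {
        regex_t part_regex;
        if(regcomp(&part_regex, "^CPU part.*(0x[0-9a-f]+)$", REG_EXTENDED) != 0)
        {
            return 1;
        }

        const std::string line = "CPU part\t: 0xd40";
        std::array<regmatch_t, 2> match;

        if(regexec(&part_regex, line.c_str(), 2, match.data(), 0) == 0)
        {
            // match[1] is the first capture group: the hexadecimal part number
            const std::string part = line.substr(match[1].rm_so, match[1].rm_eo - match[1].rm_so);
            std::cout << "CPU part: " << part << "\n"; // 0xd40
        }

        regfree(&part_regex);
        return 0;
    }
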
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index 3431834..9e6fce4 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,12 +31,11 @@
using namespace arm_compute;
Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
- : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
+ : IDistribution1D(num_bins, offset, range), _data(num_bins)
{
}
uint32_t *Distribution1D::buffer() const
{
- ARM_COMPUTE_ERROR_ON(nullptr == _data);
- return _data.get();
+ return _data.data();
}
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index fed4a15..f1457c4 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,8 +33,8 @@
{
}
-GCMemory::GCMemory(std::shared_ptr<IGCMemoryRegion> memory)
- : _region(nullptr), _region_owned(std::move(memory))
+GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory)
+ : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index f781273..6a39e7c 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,7 +97,7 @@
ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
ARM_COMPUTE_UNUSED(egl_extension_st);
- const EGLint config_attribs[] =
+ const std::array<EGLint, 3> config_attribs =
{
EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
EGL_NONE
@@ -105,7 +105,7 @@
EGLConfig cfg;
EGLint count;
- res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+ res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count);
ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
ARM_COMPUTE_UNUSED(res);
@@ -114,7 +114,7 @@
ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
- const EGLint attribs[] =
+ const std::array<EGLint, 3> attribs =
{
EGL_CONTEXT_CLIENT_VERSION, 3,
EGL_NONE
@@ -122,7 +122,7 @@
_context = eglCreateContext(_display,
cfg,
EGL_NO_CONTEXT,
- attribs);
+ attribs.data());
ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
ARM_COMPUTE_UNUSED(res);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
new file mode 100644
index 0000000..506f648
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+GCConcatenateLayer::GCConcatenateLayer()
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimZ)
+{
+}
+
+void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+ _axis = axis;
+
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
+ unsigned int offset = 0;
+ switch(axis)
+ {
+ case Window::DimZ:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+}
+
+void GCConcatenateLayer::run()
+{
+ for(auto &kernel : _concat_kernels)
+ {
+ GCScheduler::get().dispatch(*kernel, true);
+ }
+}
+} // namespace arm_compute
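
GCConcatenateLayer (like the reworked NEConcatenateLayer further down) configures one kernel per input at a running offset along the concatenation axis and then advances the offset by that input's extent, so the output extent on the axis is the sum of the input extents. A standalone sketch of that bookkeeping with illustrative sizes:

    #include <iostream>
    #include <vector>

    int main()
    {
        // Extents of three inputs along the concatenation axis (e.g. depth).
        const std::vector<unsigned int> input_extents{ 3U, 5U, 2U };

        unsigned int offset = 0;
        for(unsigned int extent : input_extents)
        {
            std::cout << "write input at offset " << offset << "\n";
            offset += extent; // the next input starts where this one ends
        }
        std::cout << "output extent on the axis: " << offset << "\n"; // 10
        return 0;
    }
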
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index a35a18a..61c0740 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -201,7 +201,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run im2col
GCScheduler::get().dispatch(_fill_border);
@@ -216,8 +216,6 @@
GCScheduler::get().dispatch(_output_col2im_kernel, false);
GCScheduler::get().memory_barrier();
- _memory_group.release();
-
// Run Activation Layer
if(_is_activationlayer_enabled)
{
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index aa937a6..b89aafa 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -47,13 +47,18 @@
unsigned int depth_offset = 0;
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector.reserve(_num_inputs);
+ _border_handlers_vector.reserve(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; i++)
{
- _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
+ auto concat_kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
+ auto border_kernel = support::cpp14::make_unique<GCFillBorderKernel>();
+
+ concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+ border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+ _concat_kernels_vector.emplace_back(std::move(concat_kernel));
+ _border_handlers_vector.emplace_back(std::move(border_kernel));
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
@@ -63,8 +68,8 @@
{
for(unsigned i = 0; i < _num_inputs; i++)
{
- GCScheduler::get().dispatch(_border_handlers_vector[i], false);
+ GCScheduler::get().dispatch(*_border_handlers_vector[i].get(), false);
GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_concat_kernels_vector[i], true);
+ GCScheduler::get().dispatch(*_concat_kernels_vector[i].get(), true);
}
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index ba05838..0f772bd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,10 @@
}
void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
+ ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
+ ARM_COMPUTE_UNUSED(dilation);
auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
k->configure(input, weights, biases, output, conv_info, depth_multiplier);
_kernel = std::move(k);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 6b8e341..a208545 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -150,7 +150,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -173,8 +173,6 @@
GCScheduler::get().dispatch(_accumulate_biases_kernel);
}
-
- _memory_group.release();
}
void GCFullyConnectedLayer::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 8ae91ee..ddfe590 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -162,7 +162,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_is_interleaved_transposed)
{
@@ -187,8 +187,6 @@
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_ma_kernel);
}
-
- _memory_group.release();
}
void GCGEMM::prepare()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index 2569365..8f60279 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -56,13 +56,11 @@
void GCNormalizationLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
GCScheduler::get().dispatch(_multiply_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel, true);
-
- _memory_group.release();
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index dad42cd..0645ae7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,13 +69,11 @@
void GCSoftmaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
GCScheduler::get().dispatch(_max_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel);
-
- _memory_group.release();
}
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 01640bb..e9f38c4 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,19 @@
using namespace arm_compute;
HOG::HOG()
- : IHOG(), _info(), _descriptor(nullptr)
+ : IHOG(), _info(), _descriptor()
{
}
void HOG::init(const HOGInfo &input)
{
- ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
- _info = input;
- _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
+ _info = input;
+ _descriptor.resize(_info.descriptor_size());
}
float *HOG::descriptor() const
{
- return _descriptor.get();
+ return _descriptor.data();
}
const HOGInfo *HOG::info() const
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index eb9051c..0db5217 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,23 +28,23 @@
using namespace arm_compute;
LutAllocator::LutAllocator()
- : _buffer(nullptr)
+ : _buffer()
{
}
uint8_t *LutAllocator::data() const
{
- return _buffer.get();
+ return _buffer.data();
}
void LutAllocator::allocate()
{
- _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
+ _buffer.resize(size());
}
uint8_t *LutAllocator::lock()
{
- return _buffer.get();
+ return _buffer.data();
}
void LutAllocator::unlock()
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index d116624..c6b956d 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,8 +32,8 @@
{
}
-Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
- : _region(nullptr), _region_owned(std::move(memory))
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
+ : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index e0b60b1..154bbd7 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,9 @@
using namespace arm_compute;
MultiHOG::MultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
+ : _num_models(num_models), _model()
{
+ _model.resize(_num_models);
}
size_t MultiHOG::num_models() const
@@ -42,11 +43,11 @@
IHOG *MultiHOG::model(size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= _num_models);
- return (_model.get() + index);
+ return (&_model[index]);
}
const IHOG *MultiHOG::model(size_t index) const
{
ARM_COMPUTE_ERROR_ON(index >= _num_models);
- return (_model.get() + index);
+ return (&_model[index]);
}
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index d33e134..6863bb0 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -57,15 +57,13 @@
void NEArgMinMaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_run_fill_border)
{
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
}
NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
-
- _memory_group.release();
}
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..a4db1fd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+ k->configure(input, block_shape, output);
+ _kernel = std::move(k);
+}
+
+void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+ k->configure(input, block_shape_x, block_shape_y, output);
+ _kernel = std::move(k);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 0e5d50f..032e617 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -162,7 +162,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run sobelNxN
_sobel->run();
@@ -184,6 +184,4 @@
// Run edge tracing
NEScheduler::get().schedule(&_edge_trace, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 21ab47d..71af560 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,56 +38,111 @@
namespace arm_compute
{
NEConcatenateLayer::NEConcatenateLayer()
- : _concat_function(nullptr)
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimX)
{
}
-void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
+ _axis = axis;
+ _num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+ std::vector<ITensorInfo *> inputs_vector_info;
+ inputs_vector_info.reserve(_num_inputs);
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
- case 0:
+ ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+ inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+ }
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+ unsigned int offset = 0;
+
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ switch(_axis)
{
- auto func = support::cpp14::make_unique<NEWidthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
- break;
+ case Window::DimX:
+ {
+ auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimY:
+ {
+ auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimZ:
+ {
+ auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
}
- case 2:
- {
- auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
}
}
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
- switch(get_data_layout_dimension_index(output->data_layout(), axis))
+ unsigned int offset = 0;
+ for(const auto &input : inputs_vector)
{
- case 0:
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
- break;
- case 2:
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
- break;
- default:
- ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ case Window::DimY:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ case Window::DimZ:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += input->dimension(axis);
}
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
return Status{};
}
void NEConcatenateLayer::run()
{
- ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
- _concat_function->run();
+ for(auto &kernel : _concat_kernels)
+ {
+ NEScheduler::get().schedule(kernel.get(), _axis);
+ }
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index b84dfd3..973855e 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,12 +103,10 @@
if(_is_separable)
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
- _memory_group.release();
}
else
{
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 5059162..a62459b 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -73,6 +73,13 @@
_function = std::move(f);
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -97,6 +104,10 @@
case ConvolutionMethod::DIRECT:
//Validate Gemm-based Convolution
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+ case ConvolutionMethod::FFT:
+ // Validate FFT-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ break;
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -148,12 +159,22 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+ if(dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
-
- return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ else
+ {
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ }
}
void NEConvolutionLayer::run()
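
The updated get_convolution_method() heuristic, restated: dilation forces GEMM, kernels taller than 7 that reduce the channel count try FFT, inputs with fewer than 16 channels stay on GEMM, and everything else tries Winograd with a GEMM fallback. A standalone sketch of that decision tree (the boolean parameters stand in for the corresponding validate() calls and are hypothetical):

    // Sketch of the method-selection heuristic above; thresholds match the hunk.
    enum class ConvMethodSketch { GEMM, WINOGRAD, FFT };

    ConvMethodSketch select_method_sketch(bool dilated, int kernel_h, int in_channels, int out_channels,
                                          bool fft_ok, bool winograd_ok)
    {
        if(dilated)
        {
            return ConvMethodSketch::GEMM; // only the GEMM path supports dilation
        }
        if(kernel_h > 7 && in_channels > out_channels && fft_ok)
        {
            return ConvMethodSketch::FFT; // large kernels that shrink the channel count
        }
        if(in_channels < 16)
        {
            return ConvMethodSketch::GEMM; // narrow inputs stay on the GEMM path
        }
        return winograd_ok ? ConvMethodSketch::WINOGRAD : ConvMethodSketch::GEMM;
    }
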
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
new file mode 100644
index 0000000..cc39d02
--- /dev/null
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+NECropResize::NECropResize()
+ : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+{
+}
+
+Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
+ Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+ TensorInfo temp_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+ }
+ return Status{};
+}
+
+void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
+ InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+ _num_boxes = boxes->info()->tensor_shape()[1];
+ TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+ _output = output;
+ _method = method;
+ _extrapolation_value = extrapolation_value;
+
+ // For each crop box:
+ // - A crop kernel is used to extract the initial cropped image as specified by boxes[i] from the 3D image input[box_ind[i]].
+ // - A tensor is required to hold this initial cropped image.
+ // - A scale function is used to resize the cropped image to the size specified by crop_size.
+ // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+ // that will hold all final cropped and scaled 3D images.
+ _crop.reserve(_num_boxes);
+ _crop_results.reserve(_num_boxes);
+ _scaled_results.reserve(_num_boxes);
+ _scale.reserve(_num_boxes);
+
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ auto crop_tensor = support::cpp14::make_unique<Tensor>();
+ TensorInfo crop_result_info(1, DataType::F32);
+ crop_result_info.set_data_layout(DataLayout::NHWC);
+ crop_tensor->allocator()->init(crop_result_info);
+
+ auto scale_tensor = support::cpp14::make_unique<Tensor>();
+ TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+ scaled_result_info.set_data_layout(DataLayout::NHWC);
+ scale_tensor->allocator()->init(scaled_result_info);
+
+ auto crop_kernel = support::cpp14::make_unique<NECropKernel>();
+ auto scale_kernel = support::cpp14::make_unique<NEScale>();
+ crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value);
+
+ _crop.emplace_back(std::move(crop_kernel));
+ _scaled_results.emplace_back(std::move(scale_tensor));
+ _crop_results.emplace_back(std::move(crop_tensor));
+ _scale.emplace_back(std::move(scale_kernel));
+ }
+}
+
+void NECropResize::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+ // may not be known until run-time and so the kernels cannot be configured until then.
+ _crop[i]->configure_output_shape();
+ _crop_results[i]->allocator()->allocate();
+ NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
+
+ // Scale the cropped image.
+ _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false);
+ _scaled_results[i]->allocator()->allocate();
+ _scale[i]->run();
+
+ // Copy scaled image into output.
+ std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
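
NECropResize extracts one 3D crop per entry in boxes (using box_ind to pick the batch slice), rescales each crop to crop_size, and copies it into the 4D output. A minimal usage sketch; the 64x64 crop size, interpolation policy and tensor contents are illustrative only:

    // Sketch: crop the configured boxes out of an NHWC input and resize each to 64x64.
    #include "arm_compute/runtime/NEON/functions/NECropResize.h"
    #include "arm_compute/runtime/Tensor.h"

    void crop_resize_sketch(arm_compute::Tensor &input, arm_compute::Tensor &boxes,
                            arm_compute::Tensor &box_ind, arm_compute::Tensor &output)
    {
        arm_compute::NECropResize crop_resize;
        crop_resize.configure(&input, &boxes, &box_ind, &output,
                              arm_compute::Coordinates2D{ 64, 64 },       // crop_size
                              arm_compute::InterpolationPolicy::BILINEAR, // AREA is rejected by validate()
                              0.f /* extrapolation_value */);
        crop_resize.run();
    }
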
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 44d7197..aff335e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -51,8 +51,8 @@
unsigned int inner_border_right, unsigned int inner_border_top)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
@@ -68,7 +68,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if(bias != nullptr)
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
@@ -111,10 +115,11 @@
_inner_border = std::make_pair(inner_border_right, inner_border_top);
_is_prepared = false;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
- _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped);
auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
@@ -159,12 +164,10 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_upsample_f.run();
_conv_f.run();
-
- _memory_group.release();
}
void NEDeconvolutionLayer::prepare()
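
NEDeconvolutionLayer now also accepts QASYMM8 input and weights; in that case validate() requires an S32 bias instead of one matching the input type. A sketch of the corresponding tensor-info setup (shapes and quantization parameters are illustrative):

    // Sketch: tensor infos for a quantized deconvolution after this change.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    void init_quantized_deconv_sketch(arm_compute::Tensor &src, arm_compute::Tensor &weights, arm_compute::Tensor &bias)
    {
        using namespace arm_compute;
        const QuantizationInfo qinfo(0.05f, 128); // scale, offset (illustrative)

        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 32U), 1, DataType::QASYMM8, qinfo));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 16U), 1, DataType::QASYMM8, qinfo));
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32)); // S32 bias for QASYMM8 data
    }
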
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 49db855..8f070a2 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,26 +45,30 @@
void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
{
- _num_inputs = inputs_vector.size();
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+ _num_inputs = inputs_vector.size();
std::vector<ITensorInfo *> inputs_vector_info;
for(unsigned int i = 0; i < _num_inputs; i++)
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
unsigned int depth_offset = 0;
+ _concat_kernels_vector.reserve(_num_inputs);
+ _border_handlers_vector.reserve(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; ++i)
{
- _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ auto concat_kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+ auto border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+ border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ _border_handlers_vector.emplace_back(std::move(border_kernel));
+ _concat_kernels_vector.emplace_back(std::move(concat_kernel));
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
@@ -80,7 +84,7 @@
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int depth_offset = 0;
@@ -98,7 +102,7 @@
{
for(unsigned i = 0; i < _num_inputs; ++i)
{
- NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
- NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX);
+ NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX);
}
}
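
The depth-specific shape helper is replaced by the generic calculate_concatenate_shape(inputs, axis), which copies the first input's shape and sums the concatenation axis across all inputs. The arithmetic, as a standalone sketch:

    // Sketch of the concatenate-shape arithmetic: every dimension comes from the
    // first input except the concatenation axis, which is summed over all inputs.
    #include "arm_compute/core/TensorShape.h"

    #include <vector>

    arm_compute::TensorShape concat_shape_sketch(const std::vector<arm_compute::TensorShape> &shapes, size_t axis)
    {
        arm_compute::TensorShape out = shapes.front();
        size_t                   sum = 0;
        for(const auto &s : shapes)
        {
            sum += s[axis];
        }
        out.set(axis, sum);
        return out;
    }

    // e.g. shapes (8, 8, 3) and (8, 8, 5) concatenated on axis 2 give (8, 8, 8).
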
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f0fd4cf..3bb69b1 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -31,112 +31,79 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
- _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
- _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+ _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+ _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_UNUSED(act_info);
PixelValue zero_value(0.f);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _has_bias = biases != nullptr;
- _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
- conv_info,
- input->info()->data_type(),
- depth_multiplier,
- input->info()->data_layout());
- _are_weights_reshaped = false;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _permute = _is_optimized == _is_nchw;
-
// Initialize the intermediate accumulator tensor in case of quantized input
if(_is_quantized)
{
TensorShape accum_shape = output->info()->tensor_shape();
DataLayout accum_layout = output->info()->data_layout();
- if(!_is_optimized && !_is_nchw)
+ if(!_is_nchw)
{
permute(accum_shape, PermutationVector(1U, 2U, 0U));
accum_layout = DataLayout::NCHW;
}
+ _memory_group.manage(&_accumulator);
_accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
_accumulator.info()->set_data_layout(accum_layout);
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- if(_is_optimized)
+ if(!_is_nchw)
{
- ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
- if(_is_nchw)
- {
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
- // Configure optimized depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
+ // Configure depthwise
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
- }
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
}
else
{
- if(!_is_nchw)
- {
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
- // Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure optimized depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
-
- // Configure border handler
- _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- }
- else
- {
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
}
// Configure biases accumulation
@@ -145,37 +112,138 @@
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
_accumulator.allocator()->allocate();
}
else if(_has_bias)
{
- _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+ _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
}
- if(!_is_optimized && !_is_nchw)
+ // Permute output
+ if(!_is_nchw)
{
// Configure the function to transform the convoluted output to NHWC
_permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
_permuted_output.allocator()->allocate();
}
+}
- //Configure Activation Layer
+void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+ if(!_is_activationlayer_enabled)
+ {
+ act_info_to_use = act_info;
+ }
+
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure optimized depthwise
+ _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
+
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
+ }
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+ _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+ weights->info(),
+ conv_info,
+ depth_multiplier, dilation);
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _permute = _is_optimized == _is_nchw;
+ _is_prepared = false;
_is_activationlayer_enabled = act_info.enabled();
+ // Configure appropriate pipeline
+ if(_is_optimized)
+ {
+ configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ }
+ else
+ {
+ configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+
+ // Configure activation
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
}
}
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
if(biases != nullptr)
{
@@ -184,14 +252,20 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
- if(is_quantized)
+ if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
}
//Validate Activation Layer
@@ -203,43 +277,14 @@
return Status{};
}
-void NEDepthwiseConvolutionLayer3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run_generic()
{
- if(_is_first_run && _is_optimized)
- {
- _is_first_run = false;
- // Create convolver (deferred)
- _dwc_kernel.generate_convolver();
- }
-
- // Permute weights
- if(_permute)
- {
- if(!_are_weights_reshaped)
- {
- _are_weights_reshaped = true;
- _permute_weights.run();
- }
-
- _permute_input.run();
- }
-
- // Handle input
- if(!_is_optimized)
- {
- // Fill border
- NEScheduler::get().schedule(&_border_handler, Window::DimX);
- }
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimX);
// Execute depthwise convolution
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
- // Permute output
- if(_is_optimized && _is_nchw)
- {
- _permute_output.run();
- }
-
// Add biases
if(_has_bias || _is_quantized)
{
@@ -247,17 +292,71 @@
}
// Permute output
- if(!_is_optimized && !_is_nchw)
+ if(!_is_nchw)
{
_permute_output.run();
}
+}
+void NEDepthwiseConvolutionLayer3x3::run_optimized()
+{
+ // Run assembly function
+ _dwc_optimized_func.run();
+
+ // Permute output
+ if(_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if(_permute)
+ {
+ _permute_input.run();
+ }
+
+ _is_optimized ? run_optimized() : run_generic();
+
+ // Run activation
if(_is_activationlayer_enabled)
{
_activationlayer_function.run();
}
}
+void NEDepthwiseConvolutionLayer3x3::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute weights
+ if(_permute)
+ {
+ _permuted_weights.allocator()->allocate();
+ _permute_weights.run();
+ _original_weights->mark_as_unused();
+ }
+
+ // Prepare optimized function
+ if(_is_optimized)
+ {
+ _dwc_optimized_func.prepare();
+ if(!_permuted_weights.is_used())
+ {
+ _permuted_weights.allocator()->free();
+ }
+ }
+
+ _is_prepared = true;
+ }
+}
+
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
_permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
@@ -266,14 +365,21 @@
}
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(channel_idx);
-
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
_is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
@@ -304,7 +410,7 @@
bool append_bias = (biases != nullptr) && !_is_quantized;
// Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -332,7 +438,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -356,7 +462,8 @@
const QuantizationInfo output_quant_info = output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
@@ -399,14 +506,17 @@
}
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
// Clone output to use auto init
auto output_clone = output->clone();
@@ -433,7 +543,7 @@
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && !is_quantized;
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
const size_t weights_w = weights_to_use->dimension(0);
const size_t weights_h = weights_to_use->dimension(1);
const size_t weights_z = weights_to_use->dimension(2);
@@ -460,7 +570,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -542,3 +652,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
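
The new dilation parameter is bounded by the checks above: the dilated kernel extent k + (k - 1) * (d - 1) must not exceed the padded input extent in either spatial dimension. The same condition as a standalone sketch:

    // Sketch of the dilation bound enforced by the ARM_COMPUTE_ERROR_ON checks above.
    inline bool dilated_kernel_fits_sketch(unsigned int input_dim, unsigned int kernel_dim,
                                           unsigned int dilation, unsigned int pad_before, unsigned int pad_after)
    {
        const unsigned int effective_kernel = kernel_dim + (kernel_dim - 1) * (dilation - 1);
        return effective_kernel <= input_dim + pad_before + pad_after;
    }

    // e.g. a 3x3 kernel with dilation 2 has an effective extent of 5, so the padded
    // input must be at least 5 wide and 5 high.
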
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 0627977..e92b4bf 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,34 +24,20 @@
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-NEDequantizationLayer::NEDequantizationLayer()
- : _dequantize_kernel()
+namespace arm_compute
{
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
-Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
-
- return Status{};
+ return NEDequantizationLayerKernel::validate(input, output);
}
-
-void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
- // Configure kernel
- _dequantize_kernel.configure(input, output, min_max);
-}
-
-void NEDequantizationLayer::run()
-{
- NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
-}
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
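
NEDequantizationLayer is reduced to a thin wrapper around NEDequantizationLayerKernel and no longer takes a min/max tensor; the conversion is driven by the input's own QuantizationInfo, i.e. real = scale * (q - offset). The scalar form of that mapping, for illustration only:

    // Sketch of QASYMM8 dequantization using a tensor's quantization parameters;
    // the NEON kernel vectorises this same computation.
    #include <cstdint>

    inline float dequantize_qasymm8_sketch(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }
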
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 40e40c8..322bb2c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,7 +105,7 @@
{
NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_conv_kernel, _dim_split);
if(_has_bias)
@@ -117,5 +117,4 @@
{
_activationlayer_function.run();
}
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000..25ba1c8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+
+ // Decompose size to radix factors
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->info()->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+ // Flags
+ _run_scale = config.direction == FFTDirection::Inverse;
+
+ const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+ // Configure digit reverse
+ FFTDigitReverseKernelInfo digit_reverse_config;
+ digit_reverse_config.axis = config.axis;
+ digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+ TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+ _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+ _memory_group.manage(&_digit_reversed_input);
+ _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+ // Create and configure FFT kernels
+ unsigned int Nx = 1;
+ _num_ffts = decomposed_vector.size();
+ _fft_kernels.resize(_num_ffts);
+ _axis = config.axis;
+
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+ FFTRadixStageKernelInfo fft_kernel_info;
+ fft_kernel_info.axis = config.axis;
+ fft_kernel_info.radix = radix_for_stage;
+ fft_kernel_info.Nx = Nx;
+ fft_kernel_info.is_first_stage = (i == 0);
+ _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+ Nx *= radix_for_stage;
+ }
+
+ // Configure scale kernel
+ if(_run_scale)
+ {
+ FFTScaleKernelInfo scale_config;
+ scale_config.scale = static_cast<float>(N);
+ scale_config.conjugate = config.direction == FFTDirection::Inverse;
+ is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+ }
+
+ // Allocate tensors
+ _digit_reversed_input.allocator()->allocate();
+ _digit_reverse_indices.allocator()->allocate();
+
+ // Init digit reverse indices
+ const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+ std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+ // Check if FFT is decomposable
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ // All combinations are supported except real input with real output (i.e., input and output both single-channel)
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void NEFFT1D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+ }
+
+ // Run output scaling
+ if(_run_scale)
+ {
+ NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+ }
+}
+} // namespace arm_compute
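
NEFFT1D decomposes the transform length into the radix sizes supported by NEFFTRadixStageKernel, runs a digit-reverse permutation, then one radix stage per factor, plus a 1/N scale for inverse transforms. A minimal usage sketch for a forward transform (lengths that do not decompose into supported radices are rejected by validate()):

    // Sketch: forward 1D FFT along the width axis of a 2-channel (complex) F32 tensor.
    #include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
    #include "arm_compute/runtime/Tensor.h"

    void fft1d_sketch(arm_compute::Tensor &src, arm_compute::Tensor &dst)
    {
        arm_compute::FFT1DInfo info;
        info.axis      = 0;                                  // transform along the width
        info.direction = arm_compute::FFTDirection::Forward; // Inverse also enables the scale kernel

        arm_compute::NEFFT1D fft;
        fft.configure(&src, &dst, info);
        fft.run();
    }
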
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
new file mode 100644
index 0000000..9210ecf
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+namespace arm_compute
+{
+NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+
+ // Setup first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ _memory_group.manage(&_first_pass_tensor);
+ _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+ // Setup second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+ _first_pass_tensor.allocator()->allocate();
+}
+
+Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ // Create intermediate tensor info
+ TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+ // Validate first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+ // Validate second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void NEFFT2D::run()
+{
+ _memory_group.acquire();
+
+ _first_pass_func.run();
+ _second_pass_func.run();
+
+ _memory_group.release();
+}
+} // namespace arm_compute
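
NEFFT2D chains two NEFFT1D passes through a managed intermediate tensor, one pass per axis in FFT2DInfo. A short usage sketch keeping the default axes and direction:

    // Sketch: 2D FFT as two chained 1D passes over the default axis pair.
    #include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
    #include "arm_compute/runtime/Tensor.h"

    void fft2d_sketch(arm_compute::Tensor &src, arm_compute::Tensor &dst)
    {
        arm_compute::FFT2DInfo info; // default axes and Forward direction
        arm_compute::NEFFT2D   fft2d;
        fft2d.configure(&src, &dst, info);
        fft2d.run(); // first pass -> intermediate tensor -> second pass
    }
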
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..0823007
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+
+ int pad = 0;
+ bool is_decomposed = false;
+ while(!is_decomposed)
+ {
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+ is_decomposed = !decomposed_vector.empty();
+ if(!is_decomposed)
+ {
+ ++pad;
+ }
+ }
+ return pad;
+}
+} // namespace
+
+NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager),
+ _flip_weights_func(),
+ _permute_input_func(),
+ _permute_output_func(),
+ _permute_weights_func(),
+ _permute_bias_func(),
+ _pad_input_func(),
+ _pad_weights_func(),
+ _transform_input_func(memory_manager),
+ _transform_weights_func(),
+ _itransform_output_func(memory_manager),
+ _prod_func(),
+ _reduce_func(),
+ _extract_output_func(),
+ _bias_add_func(),
+ _activation_layer_func(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_bias(),
+ _permuted_output(),
+ _padded_input(),
+ _padded_weights(),
+ _flip_axis(),
+ _flipped_weights(),
+ _transformed_input(),
+ _transformed_weights(),
+ _input_weights_product(),
+ _output_product(),
+ _output_reduced(),
+ _itransformed_output(),
+ _reshaped_output(),
+ _bias_output(),
+ _original_weights(nullptr),
+ _original_bias(nullptr),
+ _is_activationlayer_enabled(false),
+ _needs_permute(false),
+ _has_bias(false),
+ _is_prepared(false)
+{
+}
+
+void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ _original_weights = weights;
+ _original_bias = biases;
+
+ // Flag if bias addition is required
+ _has_bias = biases != nullptr;
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ // Tensors to use
+ ITensor *input_to_use = input;
+ const ITensor *weights_to_use = weights;
+ ITensor *output_to_use = _has_bias ? &_bias_output : output;
+
+ // Permute bias
+ if(biases != nullptr)
+ {
+ _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+ _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+ }
+
+ // Permute input if needed
+ _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+ if(_needs_permute)
+ {
+ _memory_group.manage(&_permuted_input);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+ // Pad weights
+ const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+ // Transform weights
+ _transform_weights_func = support::cpp14::make_unique<NEFFT2D>();
+ _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+ // Pad input
+ const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ _memory_group.manage(&_padded_input);
+ _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+ if(_needs_permute)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ // Transform input
+ _memory_group.manage(&_transformed_input);
+ _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+ _padded_input.allocator()->allocate();
+
+ // Perform product
+ _memory_group.manage(&_output_product);
+ _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+ _transformed_input.allocator()->allocate();
+
+ // Perform reduction
+ _memory_group.manage(&_output_reduced);
+ _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+ _output_product.allocator()->allocate();
+
+ // Transform output
+ _memory_group.manage(&_itransformed_output);
+ FFT2DInfo itransform_info;
+ itransform_info.direction = FFTDirection::Inverse;
+ _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
+ _output_reduced.allocator()->allocate();
+
+ // Reshape output
+ TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+ reshaped_shape.remove_dimension(2);
+ _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+ // Extract correct region
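+ // The inverse FFT produces the full padded convolution; crop it to the region that matches the requested output size and padding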
+ const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+ const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
+ const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if(_has_bias)
+ {
+ _memory_group.manage(&_bias_output);
+ }
+ else if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+ _reshaped_output.allocator()->allocate();
+ _itransformed_output.allocator()->allocate();
+
+ // Add bias
+ if(biases != nullptr)
+ {
+ output_to_use = output;
+ if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+ _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+ _bias_output.allocator()->allocate();
+ }
+
+ // Permute output
+ if(_needs_permute)
+ {
+ // Configure the function to transform the convolved output back to the original NHWC layout
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_output.allocator()->allocate();
+ }
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.configure(output, nullptr, act_info);
+ }
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+}
+
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Kernel size
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+ // Strides
+ const auto strides = conv_info.stride();
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+ }
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+
+ return Status{};
+}
+
+void NEFFTConvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
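+ // The scope object acquires the memory group here and releases it automatically when run() returns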
+
+ // Transform input
+ if(_needs_permute)
+ {
+ _permute_input_func.run();
+ }
+ _pad_input_func.run();
+ _transform_input_func.run();
+
+ // Perform operations in the frequency domain
+ _prod_func.run();
+
+ _reduce_func.run();
+
+ // Transform output
+ _itransform_output_func.run();
+ _reshaped_output.allocator()->import_memory(_itransformed_output.buffer());
+ _extract_output_func.run();
+
+ // Add bias
+ if(_has_bias)
+ {
+ _bias_add_func.run();
+ }
+ if(_needs_permute)
+ {
+ _permute_output_func.run();
+ }
+
+ // Run activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.run();
+ }
+}
+
+void NEFFTConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute bias to NCHW
+ if(_original_bias != nullptr)
+ {
+ _permuted_bias.allocator()->allocate();
+ _permute_bias_func.run();
+ _original_bias->mark_as_unused();
+ }
+
+ const ITensor *cur_weights = _original_weights;
+
+ // Permute weights
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_func.run();
+ cur_weights->mark_as_unused();
+ cur_weights = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->allocate();
+ _flip_weights_func.run();
+ cur_weights->mark_as_unused();
+
+ // Pad weights
+ _padded_weights.allocator()->allocate();
+ _pad_weights_func.run();
+ _flipped_weights.mark_as_unused();
+ _flipped_weights.allocator()->free();
+
+ // Transform weights to frequency domain
+ _transformed_weights.allocator()->allocate();
+ _transform_weights_func->run();
+ _transform_weights_func.reset();
+
+ _padded_weights.mark_as_unused();
+ _padded_weights.allocator()->free();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 4137b1d..af35301 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
@@ -103,6 +103,4 @@
}
NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 45e21b5..e1a17db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -228,7 +228,8 @@
if(_is_quantized)
{
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
}
-
- _memory_group.release();
}
void NEFullyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 914f088..55bcc45 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -238,16 +238,14 @@
{
prepare();
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
if(_asm_glue.is_configured())
{
- _memory_group.acquire();
_asm_glue.run();
- _memory_group.release();
}
else
{
- _memory_group.acquire();
-
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
@@ -262,8 +260,6 @@
NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
- _memory_group.release();
-
// Run matrix addition kernel
if(_run_addition)
{
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 470e922..55e067f 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -35,7 +35,7 @@
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
std::shared_ptr<IMemoryManager> memory_manager)
@@ -375,7 +375,7 @@
void NEGEMMAssemblyDispatch::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_function != nullptr)
{
_function->run();
@@ -385,6 +385,5 @@
ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
_arm_gemm->run();
}
- _memory_group.release();
}
} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index be7cc2d..a2c4e8a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,16 +90,17 @@
}
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
- _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
- _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+ _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
+ _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
+ _skip_im2col));
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -114,7 +115,41 @@
input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
- _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quantization_info : output->info()->quantization_info();
+
+ float multiplier = input_quantization_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
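+ // The float requantization scale is approximated by a normalized fixed-point multiplier and a right shift (the scale is expected to be less than one)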
+
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ _is_activationlayer_enabled = false;
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = output_quant_info.offset;
+ output_info.gemmlowp_multiplier = output_multiplier;
+ output_info.gemmlowp_shift = output_shift;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+
+ _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
// Revert the QuantizationInfo as input and weights could be used in other convolution layers
input->info()->set_quantization_info(input_quantization_info);
@@ -127,9 +162,11 @@
}
}
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
+ int gemm_3d_depth, bool skip_im2col)
{
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_activation_enabled = act_info.enabled();
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -145,8 +182,40 @@
input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quantization_info : output->quantization_info();
+
+ float multiplier = input_quantization_info.scale * weights->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = output_quant_info.offset;
+ output_info.gemmlowp_multiplier = output_multiplier;
+ output_info.gemmlowp_shift = output_shift;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+
// Perform validation step on GEMMLowp
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
+ return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
{
@@ -155,19 +224,18 @@
}
}
-Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
{
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const DataType output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
+ const DataType data_type = input_info->data_type();
+ const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
// Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
- return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
}
void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
@@ -202,9 +270,8 @@
_append_bias = (biases != nullptr) && (!_is_quantized);
_is_activationlayer_enabled = act_info.enabled();
- const ITensor *gemm_input_to_use = input;
- ITensor *gemm_output_to_use = output;
- ITensor *gemm_output_staged_to_use = output;
+ const ITensor *gemm_input_to_use = input;
+ ITensor *gemm_output_to_use = output;
// Get convolved dimensions
unsigned int conv_w = 0;
@@ -219,7 +286,7 @@
// Check if GEMM3D is supported
if(data_layout == DataLayout::NHWC)
{
- _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
+ _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!_skip_col2im)
{
@@ -262,26 +329,17 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
- if(!_skip_col2im || _is_quantized)
+ if(!_skip_col2im)
{
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
- TensorShape shape_gemm;
+ TensorShape shape_gemm;
- if(_is_quantized && _skip_col2im)
- {
- shape_gemm = output->info()->tensor_shape();
- }
- else
- {
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- }
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
// FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+ TensorInfo info_gemm(shape_gemm, 1, data_type);
info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
@@ -293,62 +351,24 @@
// Configure GEMM
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
if(!_skip_im2col)
{
_im2col_output.allocator()->allocate();
}
- // Configure output stage for quantized case
- if(_is_quantized)
- {
- const QuantizationInfo input_quant_info = input->info()->quantization_info();
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
-
- float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
- if(!_skip_col2im)
- {
- _memory_group.manage(&_tmp_output);
- gemm_output_staged_to_use = &_tmp_output;
- }
-
- // Merge activation with output stage
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- _is_activationlayer_enabled = false;
- }
-
- _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
- }
-
if(!_skip_col2im)
{
if(_data_layout == DataLayout::NCHW)
{
// Configure col2im
- _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
+ _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
}
else
{
// Configure reshape layer
- _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
+ _reshape_layer.configure(gemm_output_to_use, output);
}
}
@@ -394,11 +414,13 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
- TensorInfo im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *gemm_output_staged_to_use = output;
- const ITensorInfo *weights_to_use = weights;
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo tmp_info{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = input;
+ const ITensorInfo *gemm_output_to_use = output;
+ const ITensorInfo *weights_to_use = weights;
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool append_bias = (biases != nullptr) && (!is_quantized);
@@ -420,7 +442,7 @@
bool skip_col2im = false;
if(data_layout == DataLayout::NHWC)
{
- skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+ skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!skip_col2im)
{
@@ -431,7 +453,7 @@
if(skip_col2im)
{
// If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+ if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
{
skip_im2col = false;
skip_col2im = false;
@@ -495,68 +517,25 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
if(!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+ info_gemm = TensorInfo(shape_gemm, 1, data_type);
}
else
{
- info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+ info_gemm = TensorInfo(output->tensor_shape(), 1, data_type);
}
info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
gemm_output_to_use = &info_gemm;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
-
- if(is_quantized)
- {
- const QuantizationInfo input_quant_info = input->quantization_info();
- const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
- const float multiplier = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
- if(!skip_col2im)
- {
- tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
- tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
- gemm_output_staged_to_use = &tmp_info;
- }
-
- // Merge activation with output stage
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- is_activation_enabled = false;
- }
-
- // Validate output stage for quantized case
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
// Validate Col2Im/ReshapeLayer
if(!skip_col2im && (data_layout == DataLayout::NCHW))
{
- ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
- output,
- Size2D(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
}
//Validate Activation Layer
@@ -572,7 +551,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(!_skip_im2col)
{
@@ -586,9 +565,6 @@
{
// Run gemmlowp
_mm_gemmlowp.run();
-
- // Run output stage
- _gemmlowp_output_stage.run();
}
else
{
@@ -618,8 +594,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
void NEGEMMConvolutionLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 47c3358..ede89bf 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,5 @@
-/* Copyright (c) 2017-2018 ARM Limited.
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -116,7 +117,7 @@
void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_mtx_a_reshape_kernel)
{
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
@@ -135,6 +136,4 @@
{
NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 5286f11..54f49a6 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
- _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+ _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
{
}
@@ -53,6 +53,9 @@
ARM_COMPUTE_UNUSED(c);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = b;
+
// Clear state
_mtx_a_reshape_kernel = nullptr;
_mtx_b_reshape_kernel = nullptr;
@@ -65,6 +68,18 @@
_is_prepared = false;
_original_b = b;
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+
+ _memory_group.manage(&_mm_result_s32);
+
+ TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+
+ _mm_result_s32.allocator()->init(info_mm_result_s32);
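+ // The GEMM writes raw S32 accumulators into _mm_result_s32; the fused offset-contribution/output-stage kernel requantizes them to the final output type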
+ }
+
#ifdef __aarch64__
switch(a->info()->data_type())
{
@@ -72,7 +87,7 @@
case DataType::U8:
case DataType::S8:
{
- _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+ _asm_glue.configure(a, b, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, _reshape_b_only_on_first_run);
_dot_product_path = _asm_glue.is_configured();
break;
}
@@ -83,51 +98,35 @@
}
}
#endif /* __aarch64__ */
- if(!_dot_product_path)
+ if(!(_dot_product_path || _run_vector_matrix_multiplication))
{
- if(_run_vector_matrix_multiplication)
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
+ _tmp_a.allocator()->init(a_info);
+ _tmp_b.allocator()->init(b_info);
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
{
- // Configure matrix multiply kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(a, b, output);
- _mm_kernel = std::move(k);
- }
+ _memory_group.manage(&_tmp_b);
}
- else
+
+ // Configure interleave kernel
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
- }
-
- // Configure matrix multiply kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(&_tmp_a, &_tmp_b, output);
- _mm_kernel = std::move(k);
- }
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
}
}
@@ -158,8 +157,33 @@
_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
}
- // Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ if(_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if(!_dot_product_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = std::move(k);
+ }
+
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+ _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+ _mm_result_s32.allocator()->allocate();
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if(!_dot_product_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, output);
+ _mm_kernel = std::move(k);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ }
// Allocate tensors
if(!_dot_product_path && !_run_vector_matrix_multiplication)
@@ -185,43 +209,53 @@
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
int32_t a_offset = a->quantization_info().offset;
int32_t b_offset = b->quantization_info().offset;
const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if(fuse_output_stage)
+ {
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
// Check if we need to run the optimized assembly kernel
- const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+ const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, reshape_b_only_on_first_run));
if(run_optimised)
{
- if(output->total_size() != 0)
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(gemm_info.depth_output_gemm3d() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(gemm_info.reinterpret_input_as_3d())
{
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
}
}
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
}
else
{
@@ -231,6 +265,9 @@
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
{
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
TensorShape shape_tmp_a = a->tensor_shape();
shape_tmp_a.set(0, a->dimension(0) * 4);
@@ -241,20 +278,17 @@
shape_tmp_b.set(0, b->dimension(1) * 16);
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
- TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
- TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
- TensorInfo info_vector_sum_col, info_vector_sum_row;
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
@@ -274,12 +308,32 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
}
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ if(fuse_output_stage)
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c, output, a_offset, b_offset,
+ gemm_info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+ }
return Status{};
}
@@ -287,7 +341,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reshape inputs
if(_mtx_a_reshape_kernel)
@@ -321,10 +375,16 @@
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
-
- _memory_group.release();
+ if(_fuse_output_stage)
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ }
+ else
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ }
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index b010ca0..3c7411e 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,10 +59,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 8a85bba..0dbcb12 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,6 @@
if(num_levels > 1)
{
- _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _vertical_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
-
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
@@ -86,19 +81,33 @@
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
_tmp.init(pyramid_info);
+ _horizontal_reduction.reserve(num_levels);
+ _vertical_reduction.reserve(num_levels);
+ _horizontal_border_handler.reserve(num_levels);
+ _vertical_border_handler.reserve(num_levels);
+
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+ auto horizontal_kernel = support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+ horizontal_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+ auto vertical_kernel = support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+ vertical_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ auto horizontal_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ horizontal_border_kernel->configure(_pyramid->get_pyramid_level(i), horizontal_kernel->border_size(), border_mode, PixelValue(constant_border_value));
/* Configure border */
- _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+ auto vertical_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ vertical_border_kernel->configure(_tmp.get_pyramid_level(i), vertical_kernel->border_size(), border_mode, PixelValue(pixel_value_u16));
+
+ _vertical_border_handler.emplace_back(std::move(vertical_border_kernel));
+ _horizontal_border_handler.emplace_back(std::move(horizontal_border_kernel));
+ _vertical_reduction.emplace_back(std::move(vertical_kernel));
+ _horizontal_reduction.emplace_back(std::move(horizontal_kernel));
}
_tmp.allocate();
@@ -117,10 +126,10 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
- NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
- NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
- NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+ NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+ NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+ NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
}
}
@@ -147,19 +156,20 @@
if(num_levels > 1)
{
- _gaus5x5 = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
-
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
_tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
/* Configure gaussian 5x5 */
- _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+ auto gaus5x5_kernel = support::cpp14::make_unique<NEGaussian5x5>();
+ gaus5x5_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+ _gaus5x5.emplace_back(std::move(gaus5x5_kernel));
/* Configure scale */
- _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+ auto scale_kernel = support::cpp14::make_unique<NEScale>();
+ scale_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+ _scale_nearest.emplace_back(std::move(scale_kernel));
}
_tmp.allocate();
@@ -178,7 +188,7 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- _gaus5x5[i].run();
- _scale_nearest[i].run();
+ _gaus5x5[i]->run();
+ _scale_nearest[i]->run();
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 5e98269..8efc091 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,7 +95,7 @@
void NEHOGDescriptor::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run gradient
_gradient.run();
@@ -105,6 +105,4 @@
// Run block normalization kernel
NEScheduler::get().schedule(&_block_norm, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index efc8690..90785fe 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,13 +80,11 @@
void NEHOGGradient::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 8c834e2..26abc9d 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -126,12 +126,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _orient_bin_kernel.reserve(_num_orient_bin_kernel);
+ _block_norm_kernel.reserve(_num_block_norm_kernel);
+ _hog_detect_kernel.reserve(_num_hog_detect_kernel);
+ _hog_space.reserve(_num_orient_bin_kernel);
+ _hog_norm_space.reserve(_num_block_norm_kernel);
_non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -167,13 +167,17 @@
// Allocate HOG space
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
- _hog_space[i].allocator()->init(info_space);
+ auto hog_space_tensor = support::cpp14::make_unique<Tensor>();
+ hog_space_tensor->allocator()->init(info_space);
// Manage intermediate buffers
- _memory_group.manage(_hog_space.get() + i);
+ _memory_group.manage(hog_space_tensor.get());
// Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ auto orient_bin_kernel = support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+ orient_bin_kernel->configure(&_mag, &_phase, hog_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel.emplace_back(std::move(orient_bin_kernel));
+ _hog_space.emplace_back(std::move(hog_space_tensor));
}
// Allocate intermediate tensors
@@ -188,19 +192,23 @@
// Allocate normalized HOG space
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
- _hog_norm_space[i].allocator()->init(tensor_info);
+ auto hog_norm_space_tensor = support::cpp14::make_unique<Tensor>();
+ hog_norm_space_tensor->allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_hog_norm_space.get() + i);
+ _memory_group.manage(hog_norm_space_tensor.get());
// Initialize block normalization kernel
- _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ auto block_norm_kernel = support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+ block_norm_kernel->configure(_hog_space[idx_orient_bin].get(), hog_norm_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel.emplace_back(std::move(block_norm_kernel));
+ _hog_norm_space.emplace_back(std::move(hog_norm_space_tensor));
}
// Allocate intermediate tensors
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- _hog_space[i].allocator()->allocate();
+ _hog_space[i]->allocator()->allocate();
}
// Configure HOG detector kernel
@@ -208,7 +216,9 @@
{
const size_t idx_block_norm = input_hog_detect[i];
- _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ auto hog_detect_kernel = support::cpp14::make_unique<NEHOGDetector>();
+ hog_detect_kernel->configure(_hog_norm_space[idx_block_norm].get(), multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ _hog_detect_kernel.emplace_back(std::move(hog_detect_kernel));
}
// Configure non maxima suppression kernel
@@ -217,7 +227,7 @@
// Allocate intermediate tensors
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- _hog_norm_space[i].allocator()->allocate();
+ _hog_norm_space[i]->allocator()->allocate();
}
}
@@ -225,7 +235,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reset detection window
_detection_windows->clear();
@@ -234,21 +244,21 @@
_gradient_kernel.run();
// Run orientation binning kernel
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ for(auto &kernel : _orient_bin_kernel)
{
- NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+ NEScheduler::get().schedule(kernel.get(), Window::DimY);
}
// Run block normalization kernel
- for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ for(auto &kernel : _block_norm_kernel)
{
- NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+ NEScheduler::get().schedule(kernel.get(), Window::DimY);
}
// Run HOG detector kernel
- for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ for(auto &kernel : _hog_detect_kernel)
{
- _hog_detect_kernel[i].run();
+ kernel->run();
}
// Run non-maxima suppression kernel if enabled
@@ -256,6 +266,4 @@
{
NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index db5e926..3eadbee 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,7 +90,7 @@
_score.allocator()->init(tensor_info_score);
_nonmax.allocator()->init(tensor_info_score);
- _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list.resize(shape.x() * shape.y());
// Set/init Sobel kernel accordingly with gradient_size
switch(gradient_size)
@@ -171,20 +171,20 @@
_score.allocator()->allocate();
// Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
// Allocate once all the configure methods have been called
_nonmax.allocator()->allocate();
// Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+ _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
}
void NEHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -207,6 +207,4 @@
// Run sort & euclidean distance
NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index f333ecb..d56bd7c 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEHistogram::NEHistogram()
- : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+ : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
{
}
@@ -45,10 +45,10 @@
// Allocate space for threads local histograms
_local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
- _local_hist = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+ _local_hist.resize(_local_hist_size);
// Configure kernel
- _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+ _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
}
void NEHistogram::run()
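Note on the hunks above (NEHarrisCorners, NEHistogram): raw `support::cpp14::make_unique<T[]>` buffers become `std::vector<T>`, and kernels that previously took `unique_ptr.get()` now receive `vector.data()`. A minimal sketch of the pattern in plain C++, with placeholder names rather than arm_compute API:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for a kernel configure() that consumes raw buffer pointers.
static void configure_kernel(uint32_t *local_hist, uint32_t *window_lut)
{
    std::printf("configured with %p / %p\n", static_cast<void *>(local_hist), static_cast<void *>(window_lut));
}

int main()
{
    const size_t num_bins = 256, num_threads = 4, lut_size = 256 * 4;

    // Old: _local_hist = make_unique<uint32_t[]>(num_bins * num_threads); pass _local_hist.get()
    // New: size a std::vector with resize() (or its constructor) and pass data()
    std::vector<uint32_t> window_lut(lut_size);
    std::vector<uint32_t> local_hist;
    local_hist.resize(num_bins * num_threads);

    configure_kernel(local_hist.data(), window_lut.data());
    return 0;
}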
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 56da966..c9ab5c9 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,11 +68,9 @@
void NEL2NormalizeLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_reduce_func.run();
NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
-
- _memory_group.release();
}
} // namespace arm_compute
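The run() methods in this patch replace the manual _memory_group.acquire() / _memory_group.release() pair with a MemoryGroupResourceScope guard, so the group is released on every exit path. A minimal RAII mock of the idea in plain C++ (placeholder types, not the arm_compute classes):

#include <cstdio>

struct MemoryGroup
{
    void acquire() { std::printf("acquire\n"); }
    void release() { std::printf("release\n"); }
};

class ResourceScope
{
public:
    explicit ResourceScope(MemoryGroup &group) : _group(group) { _group.acquire(); }
    ~ResourceScope() { _group.release(); } // runs on every exit path, including early returns and exceptions
private:
    MemoryGroup &_group;
};

void run(MemoryGroup &group, bool early_exit)
{
    ResourceScope scope_mg(group);
    if(early_exit)
    {
        return; // release() still happens, unlike a manual acquire()/release() pairing
    }
    std::printf("schedule kernels\n");
}

int main()
{
    MemoryGroup mg;
    run(mg, false);
    run(mg, true);
    return 0;
}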
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 9e7a713..3d3c6a1 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -43,10 +43,10 @@
_pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
_accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
- _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
- _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
- _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
- _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+ _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+ _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+ _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
+ _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
{
}
@@ -96,22 +96,32 @@
// Configure block that calculates the forget gate
// forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
- TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
_forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
_forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _memory_group.manage(&_forget_gate_out1);
- _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+ std::vector<const ITensor *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+
_memory_group.manage(&_forget_gate_out2);
- _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
- _memory_group.manage(&_forget_gate_out3);
- _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
- _forget_gate_out2.allocator()->allocate();
+ _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2);
+
+ std::vector<const ITensor *> weights_vector;
+
+ weights_vector.emplace_back(input_to_forget_weights);
+ weights_vector.emplace_back(recurrent_to_forget_weights);
+
+ _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6);
+
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
- _forget_gate_out1.allocator()->allocate();
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+ _memory_group.manage(&_forget_gate_out1);
+ _memory_group.manage(&_forget_gate_out3);
+ _forget_gate_out6.allocator()->allocate();
+
Tensor *forget_gate_out = &_forget_gate_out5;
if(lstm_params.has_peephole_opt())
{
@@ -134,6 +144,8 @@
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
+ // We optimize this as follows:
+ // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
Tensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
@@ -146,31 +158,29 @@
}
else
{
- TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
- _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
_input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ std::vector<const ITensor *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+
+ _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2);
_memory_group.manage(&_input_gate_out1);
- _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
- _memory_group.manage(&_input_gate_out2);
- _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
- _memory_group.manage(&_input_gate_out3);
- _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
- _input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
- _input_gate_out3.allocator()->allocate();
- input_gate_out = &_input_gate_out4;
+
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+ _input_gate_out2.allocator()->allocate();
+ input_gate_out = &_input_gate_out3;
+
if(_run_peephole_opt)
{
- _memory_group.manage(&_input_gate_out5);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _memory_group.manage(&_input_gate_out4);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
- _input_gate_out5.allocator()->allocate();
input_gate_out = &_input_gate_out1;
}
else
@@ -215,35 +225,37 @@
// Configure block that calculates the output
// output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
- TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
_output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
- _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ std::vector<const ITensor *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+
+ _concat_weights_output.configure(in_out_weights, &_output2);
_memory_group.manage(&_output1);
- _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
- _memory_group.manage(&_output2);
- _transpose_output.configure(recurrent_to_output_weights, &_output2);
- _memory_group.manage(&_output3);
- _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _memory_group.manage(&_output4);
+
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
_output2.allocator()->allocate();
- _memory_group.manage(&_output5);
- _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
- _output3.allocator()->allocate();
- Tensor *output_gate_out = &_output5;
+ _forget_gate_out2.allocator()->allocate();
+
+ Tensor *output_gate_out = &_output4;
if(lstm_params.has_peephole_opt())
{
- _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
- _memory_group.manage(&_output4);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
- _output5.allocator()->allocate();
+ _memory_group.manage(&_output3);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _output4.allocator()->allocate();
output_gate_out = &_output1;
// Allocate intermediate buffers
- _output4.allocator()->allocate();
+ _output3.allocator()->allocate();
}
else
{
@@ -368,10 +380,15 @@
TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+ std::vector<const ITensorInfo *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ TensorInfo forget_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat));
+
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
@@ -389,9 +406,13 @@
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+ std::vector<const ITensorInfo *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorInfo lstm_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat));
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -421,9 +442,14 @@
}
// Validate output gate tmp
+ std::vector<const ITensorInfo *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorInfo in_out_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat));
+
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -465,12 +491,12 @@
void NELSTMLayer::run()
{
- _memory_group.acquire();
+ prepare();
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _concat_inputs_forget_gate.run();
_fully_connected_forget_gate.run();
- NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
- _gemm_forget_gate.run();
- NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
if(_run_peephole_opt)
{
@@ -494,9 +520,7 @@
else
{
_fully_connected_input_gate.run();
- NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
- _gemm_input_gate.run();
- NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+
if(_run_peephole_opt)
{
NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
@@ -520,10 +544,6 @@
}
_fully_connected_output.run();
- NEScheduler::get().schedule(&_transpose_output, Window::DimY);
- _gemm_output.run();
- NEScheduler::get().schedule(&_accum_output1, Window::DimY);
-
if(_run_peephole_opt)
{
NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
@@ -547,6 +567,18 @@
NEScheduler::get().schedule(&_copy_output, Window::DimY);
_concat_scratch_buffer.run();
+}
- _memory_group.release();
-}
\ No newline at end of file
+void NELSTMLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _concat_weights_forget_gate.run();
+ if(!_run_cifg_opt)
+ {
+ _concat_weights_input_gate.run();
+ }
+ _concat_weights_output.run();
+ _is_prepared = true;
+ }
+}
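The NELSTMLayer rewrite above folds each gate's FC + transpose + GEMM + add chain into one fully connected layer over concatenated operands, using the block-matrix identity x*Wx^T + h*Wh^T == [x|h]*[Wx|Wh]^T (the weight concatenations are computed once in prepare()). A small self-contained numerical check of that identity, with arbitrary sizes and values in plain C++:

#include <array>
#include <cassert>
#include <cmath>

int main()
{
    // x: 1x2 input, h: 1x3 recurrent state, Wx: 4x2 and Wh: 4x3 weights (one row per output unit)
    const std::array<float, 2> x{ 1.f, 2.f };
    const std::array<float, 3> h{ 3.f, 4.f, 5.f };
    const std::array<std::array<float, 2>, 4> Wx{ { { 1, 0 }, { 0, 1 }, { 1, 1 }, { 2, 3 } } };
    const std::array<std::array<float, 3>, 4> Wh{ { { 1, 2, 3 }, { 0, 1, 0 }, { 1, 0, 1 }, { 2, 2, 2 } } };

    for(std::size_t u = 0; u < 4; ++u)
    {
        // Old path: FC on x, GEMM on h, then an elementwise add
        float separate = 0.f;
        for(std::size_t i = 0; i < 2; ++i) separate += x[i] * Wx[u][i];
        for(std::size_t i = 0; i < 3; ++i) separate += h[i] * Wh[u][i];

        // New path: one product over the width-concatenated input and weights
        const std::array<float, 5> xh{ x[0], x[1], h[0], h[1], h[2] };
        const std::array<float, 5> W{ Wx[u][0], Wx[u][1], Wh[u][0], Wh[u][1], Wh[u][2] };
        float fused = 0.f;
        for(std::size_t i = 0; i < 5; ++i) fused += xh[i] * W[i];

        assert(std::fabs(separate - fused) < 1e-6f);
    }
    return 0;
}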
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 0e149d4..5174a13 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -92,8 +92,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
- _subf = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+ _convf.resize(_num_levels);
+ _subf.resize(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 9ad9689..b2d889b 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,8 +64,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
+ _addf.resize(num_levels);
+ _scalef.resize(num_levels - 1);
const size_t last_level = num_levels - 1;
@@ -86,7 +86,7 @@
void NELaplacianReconstruct::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 80a2541..d08202d 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -168,7 +168,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -178,8 +178,6 @@
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
-
- _memory_group.release();
}
void NELocallyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f00114f..d52e928 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,11 +69,9 @@
void NENormalizationLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
NEScheduler::get().schedule(&_border_handler, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index e90d8f6..0df01c6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,10 +74,10 @@
const float pyr_scale = old_pyramid->info()->scale();
- _func_scharr = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
- _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
- _scharr_gx = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
- _scharr_gy = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+ _func_scharr.reserve(_num_levels);
+ _kernel_tracker.reserve(_num_levels);
+ _scharr_gx.reserve(_num_levels);
+ _scharr_gy.reserve(_num_levels);
_old_points_internal = LKInternalKeypointArray(old_points->num_values());
_new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -95,25 +95,34 @@
TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
- _scharr_gx[i].allocator()->init(tensor_info);
- _scharr_gy[i].allocator()->init(tensor_info);
+ auto scharr_gx = support::cpp14::make_unique<Tensor>();
+ auto scharr_gy = support::cpp14::make_unique<Tensor>();
+ scharr_gx->allocator()->init(tensor_info);
+ scharr_gy->allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_scharr_gx.get() + i);
- _memory_group.manage(_scharr_gy.get() + i);
+ _memory_group.manage(scharr_gx.get());
+ _memory_group.manage(scharr_gy.get());
// Init Scharr kernel
- _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+ auto func_scharr = support::cpp14::make_unique<NEScharr3x3>();
+ func_scharr->configure(old_ith_input, scharr_gx.get(), scharr_gy.get(), border_mode, constant_border_value);
// Init Lucas-Kanade kernel
- _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
- old_points, new_points_estimates, new_points,
- &_old_points_internal, &_new_points_internal,
- termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
- i, _num_levels, pyr_scale);
+ auto kernel_tracker = support::cpp14::make_unique<NELKTrackerKernel>();
+ kernel_tracker->configure(old_ith_input, new_ith_input, scharr_gx.get(), scharr_gy.get(),
+ old_points, new_points_estimates, new_points,
+ &_old_points_internal, &_new_points_internal,
+ termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+ i, _num_levels, pyr_scale);
- _scharr_gx[i].allocator()->allocate();
- _scharr_gy[i].allocator()->allocate();
+ scharr_gx->allocator()->allocate();
+ scharr_gy->allocator()->allocate();
+
+ _func_scharr.emplace_back(std::move(func_scharr));
+ _kernel_tracker.emplace_back(std::move(kernel_tracker));
+ _scharr_gx.emplace_back(std::move(scharr_gx));
+ _scharr_gy.emplace_back(std::move(scharr_gy));
}
}
@@ -121,16 +130,14 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
- _func_scharr[level - 1].run();
+ _func_scharr[level - 1].get()->run();
// Run Lucas-Kanade kernel
- NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+ NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index f5c2718..c608edf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -25,7 +25,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -61,18 +60,28 @@
return coords;
}
+
+uint32_t last_padding_dimension(const PaddingList &padding)
+{
+ int last_padding_dim = padding.size() - 1;
+ for(; last_padding_dim >= 0; --last_padding_dim)
+ {
+ if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+ {
+ break;
+ }
+ }
+ return static_cast<uint32_t>(last_padding_dim);
+}
} // namespace
NEPadLayer::NEPadLayer()
- : _memset_kernel(), _copy_kernel(), _output_subtensor()
+ : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
{
}
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
-
// Auto-init
auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
@@ -86,23 +95,235 @@
_copy_kernel.configure(input, &_output_subtensor);
}
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
+{
+ // Reflecting can be performed by effectively unfolding the input as follows:
+ // For each dimension starting at DimX:
+ // For before and after:
+ // Use strided slice to extract and reverse the part of the
+ // input / previously produced tensor required for the padding.
+ // Concatenate the before and after padding with the input / previously
+ // produced tensor along the current dimension.
+
+ // Two strided slice functions will be required for each dimension padded as well as a
+ // concatenate function and the tensors to hold the temporary results.
+ _slice_functions.resize(2 * _num_dimensions);
+ _slice_results.resize(2 * _num_dimensions);
+ _concat_functions.resize(_num_dimensions);
+ _concat_results.resize(_num_dimensions - 1);
+
+ Coordinates starts_before{};
+ Coordinates ends_before{};
+ Coordinates starts_after{};
+ Coordinates ends_after{};
+ Coordinates strides{};
+ ITensor *prev = input;
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+ if(i > 0)
+ {
+ strides.set(i - 1, 1);
+ }
+
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ // Set the starts, ends, and strides values for the current dimension.
+ // Due to the bit masks passed to strided slice, the values below the current dimension in
+ // starts and ends will be ignored so do not need to be modified.
+ if(_mode == PaddingMode::REFLECT)
+ {
+ starts_before.set(i, _padding[i].first);
+ ends_before.set(i, 0);
+ starts_after.set(i, input->info()->dimension(i) - 2);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+ strides.set(i, -1);
+ }
+ else
+ {
+ starts_before.set(i, _padding[i].first - 1);
+ ends_before.set(i, -1);
+ starts_after.set(i, input->info()->dimension(i) - 1);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+ strides.set(i, -1);
+ }
+
+ // Strided slice wraps negative indices around to the end of the range; here a negative
+ // index should instead select the full range, so the bit mask is adjusted accordingly.
+ const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+ // Reflect the input values for the padding before and after the input.
+ std::vector<ITensor *> concat_vector;
+ if(_padding[i].first > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ concat_vector.emplace_back(&_slice_results[2 * i]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ concat_vector.push_back(prev);
+ if(_padding[i].second > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ concat_vector.emplace_back(&_slice_results[2 * i + 1]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ // Concatenate the padding before and after with the input.
+ ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+ _concat_functions[i].configure(concat_vector, out, i);
+ if(i != _num_dimensions - 1)
+ {
+ _concat_results[i].allocator()->allocate();
+ }
+ prev = out;
+ }
+ _slice_results[2 * i].allocator()->allocate();
+ _slice_results[2 * i + 1].allocator()->allocate();
+ }
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _padding = padding;
+ _mode = mode;
+
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+ // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+ _num_dimensions = last_padding_dimension(padding) + 1;
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ configure_constant_mode(input, output, padding, constant_value);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ configure_reflect_symmetric_mode(input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel.configure(input, output);
+ }
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- auto output_clone = output->clone();
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
- ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ switch(mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ auto output_clone = output->clone();
+ SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < padding.size(); ++i)
+ {
+ if(mode == PaddingMode::REFLECT)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+ }
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid mode");
+ }
+ }
return Status{};
}
void NEPadLayer::run()
{
- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i].run();
+ }
+ if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i + 1].run();
+ }
+ _concat_functions[i].run();
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ }
}
} // namespace arm_compute
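For reference on the REFLECT / SYMMETRIC index arithmetic used in configure_reflect_symmetric_mode() above: padding {1,2,3,4} by (2,2) yields {3,2,1,2,3,4,3,2} in REFLECT mode (border element excluded) and {2,1,1,2,3,4,4,3} in SYMMETRIC mode (border element included). A one-dimensional sketch of the slice-and-reverse construction in plain C++, independent of the strided-slice and concatenate kernels:

#include <cstdio>
#include <vector>

std::vector<int> pad_1d(const std::vector<int> &in, int before, int after, bool reflect)
{
    std::vector<int> out;
    const int n = static_cast<int>(in.size());
    // "Before" block: a reversed slice that excludes (REFLECT) or includes (SYMMETRIC) the first element
    for(int i = before - 1; i >= 0; --i)
    {
        out.push_back(in[reflect ? i + 1 : i]);
    }
    out.insert(out.end(), in.begin(), in.end());
    // "After" block: a reversed slice taken from the end of the input
    for(int i = 0; i < after; ++i)
    {
        out.push_back(in[reflect ? n - 2 - i : n - 1 - i]);
    }
    return out;
}

int main()
{
    for(int v : pad_1d({ 1, 2, 3, 4 }, 2, 2, true))  { std::printf("%d ", v); }  // 3 2 1 2 3 4 3 2
    std::printf("\n");
    for(int v : pad_1d({ 1, 2, 3, 4 }, 2, 2, false)) { std::printf("%d ", v); }  // 2 1 1 2 3 4 4 3
    std::printf("\n");
    return 0;
}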
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index cf6b984..ef28fe9 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
@@ -51,3 +51,27 @@
{
return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
}
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 8f7db96..65873b1 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,22 +26,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
-NEQuantizationLayer::NEQuantizationLayer()
- : _quantize_kernel(), _min_max_kernel(), _min_max()
-{
-}
-
Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
- TensorInfo min_max{ input->num_channels(), input->data_type() };
- ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output));
return Status{};
}
@@ -50,24 +41,8 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
- _min_max_kernel.configure(input, &_min_max);
-
// Configure quantize kernel
- _quantize_kernel.configure(input, output, &_min_max);
-
- // Allocate min_max tensor
- _min_max.allocator()->allocate();
-}
-
-void NEQuantizationLayer::run()
-{
- // Reset min and max
- _min_max_kernel.reset();
-
- // Run min and max kernel
- NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
-
- // Run quantize kernel
- NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+ auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
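With this change NEQuantizationLayer no longer needs its own constructor or run(): like NEPixelWiseMultiplication above, configure() builds a single kernel and hands ownership to the base "simple function" class, whose run() schedules it. A minimal plain-C++14 mock of that wrapper pattern (the names below are placeholders, not the arm_compute interfaces):

#include <memory>
#include <utility>

struct IKernel
{
    virtual ~IKernel() = default;
    virtual void execute() = 0; // stand-in for NEScheduler::get().schedule(kernel, ...)
};

struct ISimpleFunction
{
    void run() { _kernel->execute(); } // the base class owns the single-kernel run()
protected:
    std::unique_ptr<IKernel> _kernel;
};

struct QuantizeKernel : IKernel
{
    void configure(/* const ITensor *input, ITensor *output */) {}
    void execute() override {}
};

struct QuantizationFunction : ISimpleFunction
{
    void configure(/* const ITensor *input, ITensor *output */)
    {
        auto k = std::make_unique<QuantizeKernel>();
        k->configure();
        _kernel = std::move(k); // no per-function run() override required
    }
};

int main()
{
    QuantizationFunction f;
    f.configure();
    f.run();
    return 0;
}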
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 995d5ee..9ca7ded 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,7 +104,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_fully_connected_kernel.run();
@@ -115,8 +115,6 @@
// copy hidden out to output
NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-
- _memory_group.release();
}
void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 014895f..0b145f0 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -37,6 +38,8 @@
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
TensorShape out_shape = input->tensor_shape();
@@ -78,10 +81,10 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
- _reduced_outs = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
@@ -96,9 +99,9 @@
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
if(i == _reduction_ops - 1 && keep_dims)
{
@@ -107,8 +110,8 @@
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -131,13 +134,13 @@
out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
}
}
void NEReduceMean::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
@@ -148,5 +151,4 @@
{
_reshape.run();
}
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9f81a40..a0aed96 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -66,7 +66,8 @@
void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
// Configure reduction kernel
_reduction_kernel.configure(input, output, axis, op);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 483aa4c..425ee6c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -97,14 +97,17 @@
_dx(),
_dy(),
_scale_kernel(),
- _border_handler()
+ _border_handler(),
+ _use_padding(true)
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
+ ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy, use_padding));
+
+ _use_padding = use_padding;
// Get data layout and width/height indices
const DataLayout data_layout = input->info()->data_layout();
@@ -134,7 +137,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -152,7 +155,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -165,18 +168,20 @@
}
case InterpolationPolicy::AREA:
{
- _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode, constant_border_value);
break;
}
default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
-
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+ if(use_padding)
+ {
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+ }
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
- BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
@@ -213,12 +218,15 @@
}
ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
- policy, border_mode, sampling_policy));
+ policy, border_mode, constant_border_value, sampling_policy, use_padding));
return Status{};
}
void NEScale::run()
{
- NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ if(_use_padding)
+ {
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ }
NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index d8f4eda..2ddfee5 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 5b6f60b..b47a37a 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 36b7d47..79a9496 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -180,7 +180,7 @@
void NESoftmaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_flattening)
{
@@ -195,7 +195,5 @@
{
NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
}
-
- _memory_group.release();
}
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
new file mode 100644
index 0000000..46c28ad
--- /dev/null
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayer::NESpaceToBatchLayer()
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
+ }
+ _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
+ }
+ _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
+}
+
+void NESpaceToBatchLayer::run()
+{
+ // Zero out output only if we have paddings
+ if(_has_padding)
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
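A possible usage sketch for the new NESpaceToBatchLayer, based only on the static block-shape configure() overload introduced above; the shapes are arbitrary ((W, H, C, N) = (8, 8, 3, 1) with a 2x2 block and no padding, giving (4, 4, 3, 4)) and the output is initialized explicitly rather than relying on auto-initialization:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U, 4U), 1, DataType::F32));

    NESpaceToBatchLayer s2b{};
    s2b.configure(&input, /* block_shape_x */ 2, /* block_shape_y */ 2,
                  Size2D(0, 0) /* padding_left */, Size2D(0, 0) /* padding_right */, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input, then:
    s2b.run();
    return 0;
}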
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index e947657..0373ab6 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -42,8 +42,8 @@
void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
{
// Create Slice functions
- _num_outputs = outputs.size();
- _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+ _num_outputs = outputs.size();
+ _slice_functions.resize(_num_outputs);
// Get output shape
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 2f49c22..32350b0 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -43,8 +43,8 @@
void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
{
- _num_inputs = input.size();
- _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+ _num_inputs = input.size();
+ _stack_kernels.resize(_num_inputs);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 7532020..21f35f8 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -74,7 +74,7 @@
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
_num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
- _strided_slice_vector = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+ _strided_slice_vector.resize(_num_slices);
Coordinates slice_start;
int32_t slice_end_mask;
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 7e435c3..25b5216 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -40,14 +40,15 @@
{
}
-Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+template <typename TensorInfoType, typename>
+inline Status NEWidthConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
// Output auto initialization if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int width_offset = 0;
@@ -60,8 +61,8 @@
return Status{};
}
-
-void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+template <typename TensorType, typename>
+inline void NEWidthConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output)
{
_num_inputs = inputs_vector.size();
@@ -70,7 +71,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -78,7 +79,7 @@
unsigned int width_offset = 0;
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; ++i)
{
@@ -87,10 +88,30 @@
}
}
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+ configure_internal(std::move(inputs_vector), output);
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output)
+{
+ configure_internal(std::move(inputs_vector), output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+ return validate_internal(inputs_vector, output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+ return validate_internal(inputs_vector, output);
+}
+
void NEWidthConcatenateLayer::run()
{
for(unsigned i = 0; i < _num_inputs; ++i)
{
- NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY);
}
}
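
Note: the width-concatenate hunk turns the single configure()/validate() pair into thin public overloads over templated *_internal helpers, so one body serves both std::vector<ITensor *> and std::vector<const ITensor *> (the real declarations carry a second, unnamed template parameter, presumably an enable_if constraint in the header). A simplified sketch of the dispatch idea with made-up names (ExampleLayer, Item):

#include <utility>
#include <vector>

struct Item
{
};

class ExampleLayer
{
public:
    // Public overloads accept either mutable or const element pointers...
    void configure(std::vector<Item *> inputs)
    {
        configure_internal(std::move(inputs));
    }
    void configure(std::vector<const Item *> inputs)
    {
        configure_internal(std::move(inputs));
    }

private:
    // ...and a single template implements the shared logic once.
    template <typename T>
    void configure_internal(std::vector<T *> &&inputs)
    {
        _num_inputs = static_cast<unsigned int>(inputs.size());
    }

    unsigned int _num_inputs{ 0 };
};
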
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e37f8ab..1513786 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "support/ToolchainSupport.h"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
namespace arm_compute
{
@@ -162,7 +162,7 @@
const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
const int in_batches = input->info()->dimension(3);
- return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+ return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -234,12 +234,12 @@
} //namespace
-NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
: _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
- _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
- _is_prepared(false), _is_activationlayer_enabled(false)
+ _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
+ _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
{
-} /* arm_compute */
+}
void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
bool enable_fast_math)
@@ -380,20 +380,17 @@
// Kernel Storage
const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
in_channels)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
// Input storage
const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
use_same_padding)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
// Output storage
const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
use_same_padding)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
;
const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -431,14 +428,16 @@
d_strides.set(2, 0);
d_strides.set(3, data_type_size * output_matrix_stride);
- TensorInfo a_info, b_info, d_info;
+ TensorInfo a_info{};
+ TensorInfo b_info{};
+ TensorInfo d_info{};
a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
- _input_workspace.allocator()->init(a_info, storage_alignment);
+ _input_transformed.allocator()->init(a_info, storage_alignment);
_kernel_storage.allocator()->init(b_info, storage_alignment);
- _output_workspace.allocator()->init(d_info, storage_alignment);
+ _output_transformed.allocator()->init(d_info, storage_alignment);
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
@@ -446,47 +445,58 @@
1, _output->info()->data_type());
_output_nhwc.allocator()->init(info);
- // Configure the InputTransform
- _memory_group.manage(&_input_workspace);
- _memory_group.manage(&_output_workspace);
+ const ITensor *input_to_use = _input;
+ ITensor *output_to_use = _output;
+ PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
+ const unsigned int max_num_threads = NEScheduler::get().num_threads();
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
if(data_layout == DataLayout::NCHW)
{
- // configure the kernel to transform the input tensor from NCHW -> NHWC
+ _memory_group.manage(&_input_nhwc);
_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- _input_nhwc.allocator()->allocate();
- transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_workspace, input_matrix_stride);
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- _memory_group.manage(&_output_nhwc);
- transform_output_kernel->configure(biases, &_output_workspace,
- output_matrix_stride, &_output_nhwc,
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
- }
- else
- {
- transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_workspace, input_matrix_stride);
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
-
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- transform_output_kernel->configure(biases, &_output_workspace,
- output_matrix_stride, _output,
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+ input_to_use = &_input_nhwc;
+ weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
}
- _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
+ // Configure input transform kernel
+ _memory_group.manage(&_input_transformed);
+ _memory_group.manage(&_input_workspace);
+ transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_transformed, input_matrix_stride, &_input_workspace);
+ const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+ TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
+ _input_workspace.allocator()->init(input_workspace_info);
_input_workspace.allocator()->allocate();
+ if(data_layout == DataLayout::NCHW)
+ {
+ _input_nhwc.allocator()->allocate();
+ }
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
+ // Configure GEMM function
+ _memory_group.manage(&_output_transformed);
+ _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
+ _input_transformed.allocator()->allocate();
+
+ // Configure output transform function
+ // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+ if(data_layout == DataLayout::NCHW)
+ {
+ _memory_group.manage(&_output_nhwc);
+ output_to_use = &_output_nhwc;
+ }
+ transform_output_kernel->configure(biases, &_output_transformed,
+ output_matrix_stride, output_to_use,
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+ const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+ TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
+ _output_workspace.allocator()->init(output_workspace_info);
_output_workspace.allocator()->allocate();
+ _output_transformed.allocator()->allocate();
// Reorder the convoluted output to ACL's ordering NCHW
if(data_layout == DataLayout::NCHW)
@@ -513,7 +523,7 @@
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(data_layout == DataLayout::NCHW)
{
@@ -526,6 +536,7 @@
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
_gemm_function.run();
+
// Transform output tensor to the spatial domain
NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
@@ -539,8 +550,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
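
Note: in the reworked Winograd configure(), the transform kernels now report their own scratch requirements through get_working_space_size(max_num_threads), and the manual "+ storage_alignment - 1" padding on the transformed-tensor sizes is dropped. A rough sketch of that sizing pattern, where workspace_size_for() merely stands in for the kernel's method and is not a real library call:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

// Placeholder for transform_kernel->get_working_space_size(num_threads).
size_t workspace_size_for(unsigned int num_threads)
{
    return static_cast<size_t>(num_threads) * 1024; // assumed per-thread scratch
}

void init_transform_workspace(arm_compute::Tensor &workspace, arm_compute::DataType data_type)
{
    using namespace arm_compute;

    // Size the scratch buffer for the worst case: every thread working at once.
    const unsigned int max_threads = NEScheduler::get().num_threads();
    const size_t       size        = workspace_size_for(max_threads);

    // A flat 1D tensor; no manual alignment padding is added to the size.
    workspace.allocator()->init(TensorInfo(TensorShape(size), 1, data_type));
    workspace.allocator()->allocate();
}
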
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
new file mode 100644
index 0000000..049bf66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input,
+ const ITensor *weights,
+ ITensor *output,
+ PadStrideInfo conv_info,
+ ActivationLayerInfo act_info)
+{
+ const DataType data_type = input->info()->data_type();
+ const TensorShape shape = input->info()->tensor_shape();
+
+ const int n_batches = shape[3];
+ const int in_rows = shape.z();
+ const int in_cols = shape.y();
+ const int n_channels = shape.x();
+ const int padding_top = conv_info.pad_top();
+ const int padding_left = conv_info.pad_left();
+ const int padding_bottom = conv_info.pad_bottom();
+ const int padding_right = conv_info.pad_right();
+
+ const unsigned int stride_x = conv_info.stride().first;
+
+ // Map activation function
+ neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
+ if(arm_compute::utils::info_helpers::is_relu(act_info))
+ {
+ activation = neon_convolution_kernels::ActivationFunction::ReLU;
+ }
+ else if(arm_compute::utils::info_helpers::is_relu6(act_info))
+ {
+ activation = neon_convolution_kernels::ActivationFunction::ReLU6;
+ }
+
+ // Create quantized convolver
+ if(data_type == DataType::QASYMM8)
+ {
+ const QuantizationInfo &input_qinfo = input->info()->quantization_info();
+ const QuantizationInfo &weights_qinfo = weights->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = output->info()->quantization_info();
+
+        // Check that the quantization offsets are in the range [0, 255]
+ ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+ ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
+ ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+ const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+ const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
+ const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+ // Calculate rescale parameters
+ const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int qmultiplier = 0;
+ int qshift = 0;
+ quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
+ qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
+
+ // Create convolver
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+ n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+ n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ }
+ else
+ {
+ // Create float convolver
+ switch(data_type)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+ default:
+ return nullptr;
+ }
+ }
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
+ _dwc_acl_kernel()
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(depth_multiplier);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
+ weights->info(),
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ conv_info,
+ depth_multiplier,
+ act_info));
+
+    // Output auto initialization if not yet initialized
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+ _input = input;
+ _weights = weights;
+ _bias = bias;
+ _output = output;
+ _is_prepared = false;
+
+ // Create convolver
+ _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
+ ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+
+ // Create assembly kernel wrapper
+ _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+
+ constexpr size_t alignment = 128;
+
+ // Create workspace
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+ const size_t workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+ ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
+ _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
+ _memory_group.manage(&_workspace);
+ _workspace.allocator()->allocate();
+
+ // Create packing tensor
+ const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+ ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
+ _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
+}
+
+Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const auto strides = conv_info.stride();
+ const DataLayout data_layout = input->data_layout();
+ unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
+
+ // Check bias
+ if(bias != nullptr)
+ {
+ unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ // Check output
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ PadStrideInfo conv_info,
+ unsigned int depth_multiplier,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+
+ // Reshape input shape if in NHWC format
+ const DataLayout data_layout = input->data_layout();
+ TensorShape in_shape{ input->tensor_shape() };
+ if(data_layout == DataLayout::NHWC)
+ {
+ in_shape.set(Window::DimX, input->tensor_shape().y());
+ in_shape.set(Window::DimY, input->tensor_shape().z());
+ in_shape.set(Window::DimZ, input->tensor_shape().x());
+ }
+
+ // Check data type
+ const DataType data_type = weights->data_type();
+ bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type);
+
+    // Check weights size
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ bool weights_supported = (weights->dimension(width_idx) == 3) && (weights->dimension(height_idx) == 3);
+
+ // Check for supported strides
+ const auto &strides = conv_info.stride();
+ bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
+
+ // Check for supported padding
+ const auto pad_top = conv_info.pad_top();
+ const auto pad_right = conv_info.pad_right();
+ const auto pad_bottom = conv_info.pad_bottom();
+ const auto pad_left = conv_info.pad_left();
+ PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+ bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+ bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+ bool supported_padding = is_same_padding || is_valid_padding;
+ bool is_dilation_1 = dilation.x() == 1 && dilation.y() == 1;
+
+ return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::run()
+{
+ // Prepare assembly kernel
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Setup inputs/outputs
+ ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
+ _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+ const int input_element_size = _input->info()->element_size();
+ const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
+ const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size;
+ const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size;
+ const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes();
+ _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ const int output_element_size = _output->info()->element_size();
+ const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
+ const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size;
+ const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size;
+ void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes();
+ _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+
+ // Schedule assembly kernel
+ NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::prepare()
+{
+ if(!_is_prepared)
+ {
+ _packed_weights.allocator()->allocate();
+ ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
+
+ // Pack weights and bias
+ const int weights_element_size = _weights->info()->element_size();
+ const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size;
+ const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size;
+ _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+ _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+ weights_row_stride,
+ weights_col_stride,
+ (_bias != nullptr) ? _bias->buffer() : nullptr);
+ _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+
+ _weights->mark_as_unused();
+ if(_bias != nullptr)
+ {
+ _bias->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
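
Note: the new file wires the NEON depthwise assembly convolvers behind a single dispatch function: create_convolver() picks a templated kernel by data type and stride, configure() sizes the per-thread workspace and the packed-weights buffer, and prepare() packs weights and bias on first use. For QASYMM8, the requantization scale (input_scale * weights_scale / output_scale) is folded into a fixed-point multiplier and right shift via calculate_quantized_multiplier_less_than_one(). A rough usage sketch, assuming the tensors have already been initialised and allocated elsewhere and using an arbitrary example of conv_info:

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_depthwise_3x3(Tensor &input, Tensor &weights, Tensor &bias, Tensor &output)
{
    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1 with 1-pixel padding (assumed)

    // The fast path only accepts 3x3 kernels, equal strides of 1 or 2,
    // depth_multiplier == 1, dilation 1, and SAME or VALID padding.
    if(NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input.info(), weights.info(), conv_info, 1, Size2D(1, 1)))
    {
        NEDepthwiseConvolutionAssemblyDispatch dwc(nullptr);
        dwc.configure(&input, &weights, &bias, &output, conv_info, 1, ActivationLayerInfo());
        dwc.run(); // first run packs the weights via prepare(), then schedules the assembly kernel
    }
}
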
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index 34aaea0..e207ab0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -183,9 +183,8 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
- _memory_group.release();
}
void NEGEMMInterleavedWrapper::prepare()
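
Note: several run() methods in this patch (NEGEMMInterleavedWrapper here, NEWinogradConvolutionLayer and the new depthwise dispatch above) trade the explicit acquire()/release() pair for a MemoryGroupResourceScope, so the memory group is released on every exit path. A small sketch of the idiom; ExampleFunction is illustrative only and the header locations are assumed:

#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/MemoryGroup.h"

class ExampleFunction
{
public:
    void run()
    {
        // Before: _memory_group.acquire(); ...schedule kernels...; _memory_group.release();
        // An early return or exception between the two calls would skip the release.
        arm_compute::MemoryGroupResourceScope scope_mg(_memory_group);

        // ... schedule kernels; the group is released automatically on scope exit.
    }

private:
    arm_compute::MemoryGroup _memory_group{};
};
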
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index ebd6570..bc7b550 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,8 +45,8 @@
void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
- _info = info;
- _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
+ _info = info;
+ _pyramid.resize(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
@@ -56,11 +56,11 @@
TensorShape tensor_shape = _info.tensor_shape();
// Note: Look-up table used by the OpenVX sample implementation
- const float c_orbscale[4] = { 0.5f,
- SCALE_PYRAMID_ORB,
- SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
- SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
- };
+ const std::array<float, 4> c_orbscale = { 0.5f,
+ SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+ };
for(size_t i = 0; i < _info.num_levels(); ++i)
{
@@ -71,7 +71,7 @@
tensor_info.auto_padding();
}
- (_pyramid.get() + i)->allocator()->init(tensor_info);
+ _pyramid[i].allocator()->init(tensor_info);
if(is_orb_scale)
{
@@ -99,11 +99,9 @@
void Pyramid::allocate()
{
- ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
-
for(size_t i = 0; i < _info.num_levels(); ++i)
{
- (_pyramid.get() + i)->allocator()->allocate();
+ _pyramid[i].allocator()->allocate();
}
}
@@ -116,5 +114,5 @@
{
ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
- return (_pyramid.get() + index);
-}
+ return &_pyramid[index];
+}
\ No newline at end of file
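
Note: the Pyramid changes follow the same container migration, and the ORB scale look-up table moves from a raw float[4] to std::array, which carries its own size. A tiny sketch of that table; the numeric value of SCALE_PYRAMID_ORB is assumed here, not taken from this diff:

#include <array>

// Assumed value (2^-0.25); the real constant is defined in the library headers.
constexpr float scale_orb = 0.8408964f;

const std::array<float, 4> c_orbscale = { 0.5f,
                                          scale_orb,
                                          scale_orb * scale_orb,
                                          scale_orb * scale_orb * scale_orb };
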
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 38edb8b..0612d75 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -149,11 +149,11 @@
info().set_is_resizable(true);
}
-arm_compute::Status TensorAllocator::import_memory(void *memory, size_t size)
+Status TensorAllocator::import_memory(void *memory)
{
ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(size == 0);
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(alignment() != 0 && !arm_compute::utility::check_aligned(memory, alignment()));
_memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size()));
info().set_is_resizable(false);